Parser/pgen/automata.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371

"""Classes representing state-machine concepts"""

class NFA:
    """A non deterministic finite automata

    A non deterministic automata is a form of a finite state
    machine. An NFA's rules are less restrictive than a DFA.
    The NFA rules are:

      * A transition can be non-deterministic and can result in
        nothing, one, or two or more states.

      * An epsilon transition consuming empty input is valid.
        Transitions consuming labeled symbols are also permitted.

    This class assumes that there is only one starting state and one
    accepting (ending) state.

    Attributes:
        name (str): The name of the rule the NFA is representing.
        start (NFAState): The starting state.
        end (NFAState): The ending state
    """

    def __init__(self, start, end):
        self.name = start.rule_name
        self.start = start
        self.end = end

    def __repr__(self):
        return "NFA(start={}, end={})".format(self.start, self.end)

    def dump(self, writer=print):
        """Dump a graphical representation of the NFA"""
        todo = [self.start]
        for i, state in enumerate(todo):
            writer("  State", i, state is self.end and "(final)" or "")
            for arc in state.arcs:
                label = arc.label
                next = arc.target
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    writer("    -> %d" % j)
                else:
                    writer("    %s -> %d" % (label, j))


class NFAArc:
    """An arc representing a transition between two NFA states.

    NFA states can be connected via two ways:

        * A label transition: An input equal to the label must
          be consumed to perform the transition.
        * An epsilon transition: The transition can be taken without
          consuming any input symbol.

        Attributes:
            target (NFAState): The ending state of the transition arc.
            label (Optional[str]): The label that must be consumed to make
                the transition. An epsilon transition is represented
                using `None`.
    """

    def __init__(self, target, label):
        self.target = target
        self.label = label

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__, self.label)


class NFAState:
    """A state of a NFA, non deterministic finite automata.

    Attributes:
        target (rule_name): The name of the rule used to represent the NFA's
            ending state after a transition.
        arcs (Dict[Optional[str], NFAState]): A mapping representing transitions
            between the current NFA state and another NFA state via following
            a label.
    """

    def __init__(self, rule_name):
        self.rule_name = rule_name
        self.arcs = []

    def add_arc(self, target, label=None):
        """Add a new arc to connect the state to a target state within the NFA

        The method adds a new arc to the list of arcs available as transitions
        from the present state. An optional label indicates a named transition
        that consumes an input while the absence of a label represents an epsilon
        transition.

        Attributes:
            target (NFAState): The end of the transition that the arc represents.
            label (Optional[str]): The label that must be consumed for making
                the transition. If the label is not provided the transition is assumed
                to be an epsilon-transition.
        """
        assert label is None or isinstance(label, str)
        assert isinstance(target, NFAState)
        self.arcs.append(NFAArc(target, label))

    def __repr__(self):
        return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)


class DFA:
    """A deterministic finite automata

    A deterministic finite automata is a form of a finite state machine
    that obeys the following rules:

       * Each of the transitions is uniquely determined by
         the source state and input symbol
       * Reading an input symbol is required for each state
         transition (no epsilon transitions).

    The finite-state machine will accept or reject a string of symbols
    and only produces a unique computation of the automaton for each input
    string. The DFA must have a unique starting state (represented as the first
    element in the list of states) but can have multiple final states.

    Attributes:
        name (str): The name of the rule the DFA is representing.
        states (List[DFAState]): A collection of DFA states.
    """

    def __init__(self, name, states):
        self.name = name
        self.states = states

    @classmethod
    def from_nfa(cls, nfa):
        """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm.

        To simulate the operation of a DFA on a given input string, it's
        necessary to keep track of a single state at any time, or more precisely,
        the state that the automaton will reach after seeing a prefix of the
        input. In contrast, to simulate an NFA, it's necessary to keep track of
        a set of states: all of the states that the automaton could reach after
        seeing the same prefix of the input, according to the nondeterministic
        choices made by the automaton. There are two possible sources of
        non-determinism:

        1) Multiple (one or more) transitions with the same label

                         'A'     +-------+
                    +----------->+ State +----------->+
                    |            |   2   |
            +-------+            +-------+
            | State |
            |   1   |            +-------+
            +-------+            | State |
                    +----------->+   3   +----------->+
                         'A'     +-------+

        2) Epsilon transitions (transitions that can be taken without consuming any input)

            +-------+            +-------+
            | State |     ε      | State |
            |   1   +----------->+   2   +----------->+
            +-------+            +-------+

        Looking at the first case above, we can't determine which transition should be
        followed when given an input A. We could choose whether or not to follow the
        transition while in the second case the problem is that we can choose both to
        follow the transition or not doing it. To solve this problem we can imagine that
        we follow all possibilities at the same time and we construct new states from the
        set of all possible reachable states. For every case in the previous example:


        1) For multiple transitions with the same label we colapse all of the
           final states under the same one

            +-------+            +-------+
            | State |     'A'    | State |
            |   1   +----------->+  2-3  +----------->+
            +-------+            +-------+

        2) For epsilon transitions we collapse all epsilon-reachable states
           into the same one

            +-------+
            | State |
            |  1-2  +----------->
            +-------+

        Because the DFA states consist of sets of NFA states, an n-state NFA
        may be converted to a DFA with at most 2**n states. Notice that the
        constructed DFA is not minimal and can be simplified or reduced
        afterwards.

        Parameters:
            name (NFA): The NFA to transform to DFA.
        """
        assert isinstance(nfa, NFA)

        def add_closure(nfa_state, base_nfa_set):
            """Calculate the epsilon-closure of a given state

            Add to the *base_nfa_set* all the states that are
            reachable from *nfa_state* via epsilon-transitions.
            """
            assert isinstance(nfa_state, NFAState)
            if nfa_state in base_nfa_set:
                return
            base_nfa_set.add(nfa_state)
            for nfa_arc in nfa_state.arcs:
                if nfa_arc.label is None:
                    add_closure(nfa_arc.target, base_nfa_set)

        # Calculte the epsilon-closure of the starting state
        base_nfa_set = set()
        add_closure(nfa.start, base_nfa_set)

        # Start by visiting the NFA starting state (there is only one).
        states = [DFAState(nfa.name, base_nfa_set, nfa.end)]

        for state in states:  # NB states grow while we're iterating

            # Find transitions from the current state to other reachable states
            # and store them in mapping that correlates the label to all the
            # possible reachable states that can be obtained by consuming a
            # token equal to the label. Each set of all the states that can
            # be reached after following a label will be the a DFA state.
            arcs = {}
            for nfa_state in state.nfa_set:
                for nfa_arc in nfa_state.arcs:
                    if nfa_arc.label is not None:
                        nfa_set = arcs.setdefault(nfa_arc.label, set())
                        # All states that can be reached by epsilon-transitions
                        # are also included in the set of reachable states.
                        add_closure(nfa_arc.target, nfa_set)

            # Now create new DFAs by visiting all posible transitions between
            # the current DFA state and the new power-set states (each nfa_set)
            # via the different labels. As the nodes are appended to *states* this
            # is performing a deep-first search traversal over the power-set of
            # the states of the original NFA.
            for label, nfa_set in sorted(arcs.items()):
                for exisisting_state in states:
                    if exisisting_state.nfa_set == nfa_set:
                        # The DFA state already exists for this rule.
                        next_state = exisisting_state
                        break
                else:
                    next_state = DFAState(nfa.name, nfa_set, nfa.end)
                    states.append(next_state)

                # Add a transition between the current DFA state and the new
                # DFA state (the power-set state) via the current label.
                state.add_arc(next_state, label)

        return cls(nfa.name, states)

    def __iter__(self):
        return iter(self.states)

    def simplify(self):
        """Attempt to reduce the number of states of the DFA

        Transform the DFA into an equivalent DFA that has fewer states. Two
        classes of states can be removed or merged from the original DFA without
        affecting the language it accepts to minimize it:

            * Unreachable states can not be reached from the initial
              state of the DFA, for any input string.
            * Nondistinguishable states are those that cannot be distinguished
              from one another for any input string.

        This algorithm does not achieve the optimal fully-reduced solution, but it
        works well enough for the particularities of the Python grammar. The
        algorithm repeatedly looks for two states that have the same set of
        arcs (same labels pointing to the same nodes) and unifies them, until
        things stop changing.
        """
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(self.states):
                for j in range(i + 1, len(self.states)):
                    state_j = self.states[j]
                    if state_i == state_j:
                        del self.states[j]
                        for state in self.states:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def dump(self, writer=print):
        """Dump a graphical representation of the DFA"""
        for i, state in enumerate(self.states):
            writer("  State", i, state.is_final and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                writer("    %s -> %d" % (label, self.states.index(next)))


class DFAState(object):
    """A state of a DFA

    Attributes:
        rule_name (rule_name): The name of the DFA rule containing the represented state.
        nfa_set (Set[NFAState]): The set of NFA states used to create this state.
        final (bool): True if the state represents an accepting state of the DFA
            containing this state.
        arcs (Dict[label, DFAState]): A mapping representing transitions between
            the current DFA state and another DFA state via following a label.
    """

    def __init__(self, rule_name, nfa_set, final):
        assert isinstance(nfa_set, set)
        assert isinstance(next(iter(nfa_set)), NFAState)
        assert isinstance(final, NFAState)
        self.rule_name = rule_name
        self.nfa_set = nfa_set
        self.arcs = {}  # map from terminals/nonterminals to DFAState
        self.is_final = final in nfa_set

    def add_arc(self, target, label):
        """Add a new arc to the current state.

        Parameters:
            target (DFAState): The DFA state at the end of the arc.
            label (str): The label respresenting the token that must be consumed
                to perform this transition.
        """
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(target, DFAState)
        self.arcs[label] = target

    def unifystate(self, old, new):
        """Replace all arcs from the current node to *old* with *new*.

        Parameters:
            old (DFAState): The  DFA state to remove from all existing arcs.
            new (DFAState): The DFA state to replace in all existing arcs.
        """
        for label, next_ in self.arcs.items():
            if next_ is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # The nfa_set does not matter for  equality
        assert isinstance(other, DFAState)
        if self.is_final != other.is_final:
            return False
        # We cannot just return self.arcs == other.arcs because that
        # would invoke this method recursively if there are any cycles.
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next_ in self.arcs.items():
            if next_ is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # For Py3 compatibility.

    def __repr__(self):
        return "<%s: %s is_final=%s>" % (
            self.__class__.__name__,
            self.rule_name,
            self.is_final,
        )