diff options
Diffstat (limited to 'Parser/pgen/automata.py')
-rw-r--r-- | Parser/pgen/automata.py | 371 |
1 files changed, 371 insertions, 0 deletions
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py new file mode 100644 index 0000000..3147d86 --- /dev/null +++ b/Parser/pgen/automata.py @@ -0,0 +1,371 @@ +"""Classes representing state-machine concepts""" + +class NFA: + """A non deterministic finite automata + + A non deterministic automata is a form of a finite state + machine. An NFA's rules are less restrictive than a DFA. + The NFA rules are: + + * A transition can be non-deterministic and can result in + nothing, one, or two or more states. + + * An epsilon transition consuming empty input is valid. + Transitions consuming labeled symbols are also permitted. + + This class assumes that there is only one starting state and one + accepting (ending) state. + + Attributes: + name (str): The name of the rule the NFA is representing. + start (NFAState): The starting state. + end (NFAState): The ending state + """ + + def __init__(self, start, end): + self.name = start.rule_name + self.start = start + self.end = end + + def __repr__(self): + return "NFA(start={}, end={})".format(self.start, self.end) + + def dump(self, writer=print): + """Dump a graphical representation of the NFA""" + todo = [self.start] + for i, state in enumerate(todo): + writer(" State", i, state is self.end and "(final)" or "") + for arc in state.arcs: + label = arc.label + next = arc.target + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + writer(" -> %d" % j) + else: + writer(" %s -> %d" % (label, j)) + + +class NFAArc: + """An arc representing a transition between two NFA states. + + NFA states can be connected via two ways: + + * A label transition: An input equal to the label must + be consumed to perform the transition. + * An epsilon transition: The transition can be taken without + consuming any input symbol. + + Attributes: + target (NFAState): The ending state of the transition arc. + label (Optional[str]): The label that must be consumed to make + the transition. An epsilon transition is represented + using `None`. + """ + + def __init__(self, target, label): + self.target = target + self.label = label + + def __repr__(self): + return "<%s: %s>" % (self.__class__.__name__, self.label) + + +class NFAState: + """A state of a NFA, non deterministic finite automata. + + Attributes: + target (rule_name): The name of the rule used to represent the NFA's + ending state after a transition. + arcs (Dict[Optional[str], NFAState]): A mapping representing transitions + between the current NFA state and another NFA state via following + a label. + """ + + def __init__(self, rule_name): + self.rule_name = rule_name + self.arcs = [] + + def add_arc(self, target, label=None): + """Add a new arc to connect the state to a target state within the NFA + + The method adds a new arc to the list of arcs available as transitions + from the present state. An optional label indicates a named transition + that consumes an input while the absence of a label represents an epsilon + transition. + + Attributes: + target (NFAState): The end of the transition that the arc represents. + label (Optional[str]): The label that must be consumed for making + the transition. If the label is not provided the transition is assumed + to be an epsilon-transition. + """ + assert label is None or isinstance(label, str) + assert isinstance(target, NFAState) + self.arcs.append(NFAArc(target, label)) + + def __repr__(self): + return "<%s: from %s>" % (self.__class__.__name__, self.rule_name) + + +class DFA: + """A deterministic finite automata + + A deterministic finite automata is a form of a finite state machine + that obeys the following rules: + + * Each of the transitions is uniquely determined by + the source state and input symbol + * Reading an input symbol is required for each state + transition (no epsilon transitions). + + The finite-state machine will accept or reject a string of symbols + and only produces a unique computation of the automaton for each input + string. The DFA must have a unique starting state (represented as the first + element in the list of states) but can have multiple final states. + + Attributes: + name (str): The name of the rule the DFA is representing. + states (List[DFAState]): A collection of DFA states. + """ + + def __init__(self, name, states): + self.name = name + self.states = states + + @classmethod + def from_nfa(cls, nfa): + """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm. + + To simulate the operation of a DFA on a given input string, it's + necessary to keep track of a single state at any time, or more precisely, + the state that the automaton will reach after seeing a prefix of the + input. In contrast, to simulate an NFA, it's necessary to keep track of + a set of states: all of the states that the automaton could reach after + seeing the same prefix of the input, according to the nondeterministic + choices made by the automaton. There are two possible sources of + non-determinism: + + 1) Multiple (one or more) transitions with the same label + + 'A' +-------+ + +----------->+ State +----------->+ + | | 2 | + +-------+ +-------+ + | State | + | 1 | +-------+ + +-------+ | State | + +----------->+ 3 +----------->+ + 'A' +-------+ + + 2) Epsilon transitions (transitions that can be taken without consuming any input) + + +-------+ +-------+ + | State | ε | State | + | 1 +----------->+ 2 +----------->+ + +-------+ +-------+ + + Looking at the first case above, we can't determine which transition should be + followed when given an input A. We could choose whether or not to follow the + transition while in the second case the problem is that we can choose both to + follow the transition or not doing it. To solve this problem we can imagine that + we follow all possibilities at the same time and we construct new states from the + set of all possible reachable states. For every case in the previous example: + + + 1) For multiple transitions with the same label we colapse all of the + final states under the same one + + +-------+ +-------+ + | State | 'A' | State | + | 1 +----------->+ 2-3 +----------->+ + +-------+ +-------+ + + 2) For epsilon transitions we collapse all epsilon-reachable states + into the same one + + +-------+ + | State | + | 1-2 +-----------> + +-------+ + + Because the DFA states consist of sets of NFA states, an n-state NFA + may be converted to a DFA with at most 2**n states. Notice that the + constructed DFA is not minimal and can be simplified or reduced + afterwards. + + Parameters: + name (NFA): The NFA to transform to DFA. + """ + assert isinstance(nfa, NFA) + + def add_closure(nfa_state, base_nfa_set): + """Calculate the epsilon-closure of a given state + + Add to the *base_nfa_set* all the states that are + reachable from *nfa_state* via epsilon-transitions. + """ + assert isinstance(nfa_state, NFAState) + if nfa_state in base_nfa_set: + return + base_nfa_set.add(nfa_state) + for nfa_arc in nfa_state.arcs: + if nfa_arc.label is None: + add_closure(nfa_arc.target, base_nfa_set) + + # Calculte the epsilon-closure of the starting state + base_nfa_set = set() + add_closure(nfa.start, base_nfa_set) + + # Start by visiting the NFA starting state (there is only one). + states = [DFAState(nfa.name, base_nfa_set, nfa.end)] + + for state in states: # NB states grow while we're iterating + + # Find transitions from the current state to other reachable states + # and store them in mapping that correlates the label to all the + # possible reachable states that can be obtained by consuming a + # token equal to the label. Each set of all the states that can + # be reached after following a label will be the a DFA state. + arcs = {} + for nfa_state in state.nfa_set: + for nfa_arc in nfa_state.arcs: + if nfa_arc.label is not None: + nfa_set = arcs.setdefault(nfa_arc.label, set()) + # All states that can be reached by epsilon-transitions + # are also included in the set of reachable states. + add_closure(nfa_arc.target, nfa_set) + + # Now create new DFAs by visiting all posible transitions between + # the current DFA state and the new power-set states (each nfa_set) + # via the different labels. As the nodes are appended to *states* this + # is performing a deep-first search traversal over the power-set of + # the states of the original NFA. + for label, nfa_set in sorted(arcs.items()): + for exisisting_state in states: + if exisisting_state.nfa_set == nfa_set: + # The DFA state already exists for this rule. + next_state = exisisting_state + break + else: + next_state = DFAState(nfa.name, nfa_set, nfa.end) + states.append(next_state) + + # Add a transition between the current DFA state and the new + # DFA state (the power-set state) via the current label. + state.add_arc(next_state, label) + + return cls(nfa.name, states) + + def __iter__(self): + return iter(self.states) + + def simplify(self): + """Attempt to reduce the number of states of the DFA + + Transform the DFA into an equivalent DFA that has fewer states. Two + classes of states can be removed or merged from the original DFA without + affecting the language it accepts to minimize it: + + * Unreachable states can not be reached from the initial + state of the DFA, for any input string. + * Nondistinguishable states are those that cannot be distinguished + from one another for any input string. + + This algorithm does not achieve the optimal fully-reduced solution, but it + works well enough for the particularities of the Python grammar. The + algorithm repeatedly looks for two states that have the same set of + arcs (same labels pointing to the same nodes) and unifies them, until + things stop changing. + """ + changes = True + while changes: + changes = False + for i, state_i in enumerate(self.states): + for j in range(i + 1, len(self.states)): + state_j = self.states[j] + if state_i == state_j: + del self.states[j] + for state in self.states: + state.unifystate(state_j, state_i) + changes = True + break + + def dump(self, writer=print): + """Dump a graphical representation of the DFA""" + for i, state in enumerate(self.states): + writer(" State", i, state.is_final and "(final)" or "") + for label, next in sorted(state.arcs.items()): + writer(" %s -> %d" % (label, self.states.index(next))) + + +class DFAState(object): + """A state of a DFA + + Attributes: + rule_name (rule_name): The name of the DFA rule containing the represented state. + nfa_set (Set[NFAState]): The set of NFA states used to create this state. + final (bool): True if the state represents an accepting state of the DFA + containing this state. + arcs (Dict[label, DFAState]): A mapping representing transitions between + the current DFA state and another DFA state via following a label. + """ + + def __init__(self, rule_name, nfa_set, final): + assert isinstance(nfa_set, set) + assert isinstance(next(iter(nfa_set)), NFAState) + assert isinstance(final, NFAState) + self.rule_name = rule_name + self.nfa_set = nfa_set + self.arcs = {} # map from terminals/nonterminals to DFAState + self.is_final = final in nfa_set + + def add_arc(self, target, label): + """Add a new arc to the current state. + + Parameters: + target (DFAState): The DFA state at the end of the arc. + label (str): The label respresenting the token that must be consumed + to perform this transition. + """ + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(target, DFAState) + self.arcs[label] = target + + def unifystate(self, old, new): + """Replace all arcs from the current node to *old* with *new*. + + Parameters: + old (DFAState): The DFA state to remove from all existing arcs. + new (DFAState): The DFA state to replace in all existing arcs. + """ + for label, next_ in self.arcs.items(): + if next_ is old: + self.arcs[label] = new + + def __eq__(self, other): + # The nfa_set does not matter for equality + assert isinstance(other, DFAState) + if self.is_final != other.is_final: + return False + # We cannot just return self.arcs == other.arcs because that + # would invoke this method recursively if there are any cycles. + if len(self.arcs) != len(other.arcs): + return False + for label, next_ in self.arcs.items(): + if next_ is not other.arcs.get(label): + return False + return True + + __hash__ = None # For Py3 compatibility. + + def __repr__(self): + return "<%s: %s is_final=%s>" % ( + self.__class__.__name__, + self.rule_name, + self.is_final, + ) |