Parser/pgen/grammar.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

import collections


class Grammar:
    """Pgen parsing tables class.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule.

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = collections.OrderedDict()
        self.number2symbol = collections.OrderedDict()
        self.states = []
        self.dfas = collections.OrderedDict()
        self.labels = [(0, "EMPTY")]
        self.keywords = collections.OrderedDict()
        self.tokens = collections.OrderedDict()
        self.symbol2label = collections.OrderedDict()
        self.start = 256

    def produce_graminit_h(self, writer):
        writer("/* Generated by Parser/pgen */\n\n")
        for number, symbol in self.number2symbol.items():
            writer("#define {} {}\n".format(symbol, number))

    def produce_graminit_c(self, writer):
        writer("/* Generated by Parser/pgen */\n\n")

        writer('#include "grammar.h"\n')
        writer("grammar _PyParser_Grammar;\n")

        self.print_dfas(writer)
        self.print_labels(writer)

        writer("grammar _PyParser_Grammar = {\n")
        writer("    {n_dfas},\n".format(n_dfas=len(self.dfas)))
        writer("    dfas,\n")
        writer("    {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
        writer("    {start_number}\n".format(start_number=self.start))
        writer("};\n")

    def print_labels(self, writer):
        writer(
            "static const label labels[{n_labels}] = {{\n".format(
                n_labels=len(self.labels)
            )
        )
        for label, name in self.labels:
            label_name = '"{}"'.format(name) if name is not None else 0
            writer(
                "    {{{label}, {label_name}}},\n".format(
                    label=label, label_name=label_name
                )
            )
        writer("};\n")

    def print_dfas(self, writer):
        self.print_states(writer)
        writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas)))
        for dfaindex, dfa_elem in enumerate(self.dfas.items()):
            symbol, (dfa, first_sets) = dfa_elem
            writer(
                '    {{{dfa_symbol}, "{symbol_name}", '.format(
                    dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
                )
                + "{n_states}, states_{dfa_index},\n".format(
                    n_states=len(dfa), dfa_index=dfaindex
                )
                + '     "'
            )

            bitset = bytearray((len(self.labels) >> 3) + 1)
            for token in first_sets:
                bitset[token >> 3] |= 1 << (token & 7)
            for byte in bitset:
                writer("\\%03o" % (byte & 0xFF))
            writer('"},\n')
        writer("};\n")

    def print_states(self, write):
        for dfaindex, dfa in enumerate(self.states):
            self.print_arcs(write, dfaindex, dfa)
            write(
                "static state states_{dfa_index}[{n_states}] = {{\n".format(
                    dfa_index=dfaindex, n_states=len(dfa)
                )
            )
            for stateindex, state in enumerate(dfa):
                narcs = len(state)
                write(
                    "    {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
                        n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
                    )
                )
            write("};\n")

    def print_arcs(self, write, dfaindex, states):
        for stateindex, state in enumerate(states):
            narcs = len(state)
            write(
                "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
                    dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
                )
            )
            for a, b in state:
                write(
                    "    {{{from_label}, {to_state}}},\n".format(
                        from_label=a, to_state=b
                    )
                )
            write("};\n")