Lib/lib2to3/pgen2/conv.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Convert graminit.[ch] spit out by pgen to Python code.

Pgen is the Python parser generator.  It is useful to quickly create a
parser from a grammar file in Python's grammar notation.  But I don't
want my parsers to be written in C (yet), so I'm translating the
parsing tables to Python data structures and writing a Python parse
engine.

Note that the token numbers are constants determined by the standard
Python tokenizer.  The standard token module defines these numbers and
their names (the names are not used much).  The token numbers are
hardcoded into the Python tokenizer and into pgen.  A Python
implementation of the Python tokenizer is also available, in the
standard tokenize module.

On the other hand, symbol numbers (representing the grammar's
non-terminals) are assigned by pgen based on the actual grammar
input.

Note: this module is pretty much obsolete; the pgen module generates
equivalent grammar tables directly from the Grammar.txt input file
without having to invoke the Python pgen C program.

"""

# Python imports
import re

# Local imports
from pgen2 import grammar, token


class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c.  The other methods are for internal use only.

    See the base class for more documentation.

    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen."""
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen.  (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers.  We build two tables
        mapping the numbers to names and back.

        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        self.symbol2number = {}
        self.number2symbol = {}
        lineno = 0
        for line in f:
            lineno += 1
            mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
            if not mo and line.strip():
                print("%s(%s): can't parse %s" % (filename, lineno,
                                                  line.strip()))
            else:
                symbol, number = mo.groups()
                number = int(number)
                assert symbol not in self.symbol2number
                assert number not in self.number2symbol
                self.symbol2number[symbol] = number
                self.number2symbol[number] = symbol
        return True

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen.  (Internal)

        The file looks as follows.  The first two lines are always this:

        #include "pgenheaders.h"
        #include "grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
                  {<i>, <j>},
                  ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
                  {<k>, arcs_<n>_<m>},
                  ...
          };

        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # The code below essentially uses f's iterator-ness!
        lineno = 0

        # Expect the two #include lines
        lineno, line = lineno+1, next(f)
        assert line == '#include "pgenheaders.h"\n', (lineno, line)
        lineno, line = lineno+1, next(f)
        assert line == '#include "grammar.h"\n', (lineno, line)

        # Parse the state definitions
        lineno, line = lineno+1, next(f)
        allarcs = {}
        states = []
        while line.startswith("static arc "):
            while line.startswith("static arc "):
                mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
                              line)
                assert mo, (lineno, line)
                n, m, k = list(map(int, mo.groups()))
                arcs = []
                for _ in range(k):
                    lineno, line = lineno+1, next(f)
                    mo = re.match(r"\s+{(\d+), (\d+)},$", line)
                    assert mo, (lineno, line)
                    i, j = list(map(int, mo.groups()))
                    arcs.append((i, j))
                lineno, line = lineno+1, next(f)
                assert line == "};\n", (lineno, line)
                allarcs[(n, m)] = arcs
                lineno, line = lineno+1, next(f)
            mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            s, t = list(map(int, mo.groups()))
            assert s == len(states), (lineno, line)
            state = []
            for _ in range(t):
                lineno, line = lineno+1, next(f)
                mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
                assert mo, (lineno, line)
                k, n, m = list(map(int, mo.groups()))
                arcs = allarcs[n, m]
                assert k == len(arcs), (lineno, line)
                state.append(arcs)
            states.append(state)
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            lineno, line = lineno+1, next(f)
        self.states = states

        # Parse the dfas
        dfas = {}
        mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
        assert mo, (lineno, line)
        ndfas = int(mo.group(1))
        for i in range(ndfas):
            lineno, line = lineno+1, next(f)
            mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                          line)
            assert mo, (lineno, line)
            symbol = mo.group(2)
            number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
            assert self.symbol2number[symbol] == number, (lineno, line)
            assert self.number2symbol[number] == symbol, (lineno, line)
            assert x == 0, (lineno, line)
            state = states[z]
            assert y == len(state), (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
            assert mo, (lineno, line)
            first = {}
            rawbitset = eval(mo.group(1))
            for i, c in enumerate(rawbitset):
                byte = ord(c)
                for j in range(8):
                    if byte & (1<<j):
                        first[i*8 + j] = 1
            dfas[number] = (state, first)
        lineno, line = lineno+1, next(f)
        assert line == "};\n", (lineno, line)
        self.dfas = dfas

        # Parse the labels
        labels = []
        lineno, line = lineno+1, next(f)
        mo = re.match(r"static label labels\[(\d+)\] = {$", line)
        assert mo, (lineno, line)
        nlabels = int(mo.group(1))
        for i in range(nlabels):
            lineno, line = lineno+1, next(f)
            mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
            assert mo, (lineno, line)
            x, y = mo.groups()
            x = int(x)
            if y == "0":
                y = None
            else:
                y = eval(y)
            labels.append((x, y))
        lineno, line = lineno+1, next(f)
        assert line == "};\n", (lineno, line)
        self.labels = labels

        # Parse the grammar struct
        lineno, line = lineno+1, next(f)
        assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
        lineno, line = lineno+1, next(f)
        mo = re.match(r"\s+(\d+),$", line)
        assert mo, (lineno, line)
        ndfas = int(mo.group(1))
        assert ndfas == len(self.dfas)
        lineno, line = lineno+1, next(f)
        assert line == "\tdfas,\n", (lineno, line)
        lineno, line = lineno+1, next(f)
        mo = re.match(r"\s+{(\d+), labels},$", line)
        assert mo, (lineno, line)
        nlabels = int(mo.group(1))
        assert nlabels == len(self.labels), (lineno, line)
        lineno, line = lineno+1, next(f)
        mo = re.match(r"\s+(\d+)$", line)
        assert mo, (lineno, line)
        start = int(mo.group(1))
        assert start in self.number2symbol, (lineno, line)
        self.start = start
        lineno, line = lineno+1, next(f)
        assert line == "};\n", (lineno, line)
        try:
            lineno, line = lineno+1, next(f)
        except StopIteration:
            pass
        else:
            assert 0, (lineno, line)

    def finish_off(self):
        """Create additional useful structures.  (Internal)."""
        self.keywords = {} # map from keyword strings to arc labels
        self.tokens = {}   # map from numeric token values to arc labels
        for ilabel, (type, value) in enumerate(self.labels):
            if type == token.NAME and value is not None:
                self.keywords[value] = ilabel
            elif value is None:
                self.tokens[type] = ilabel