1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
|
"""Python parser generator
This parser generator transforms a Python grammar file into parsing tables
that can be consumed by Python's LL(1) parser written in C.
Concepts
--------
* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
top-down parser for a subset of context-free languages. It parses the input
from Left to right, performing Leftmost derivation of the sentence, and can
  only use 1 token of lookahead when parsing a sentence.
* A parsing table is a collection of data that a generic implementation of the
LL(1) parser consumes to know how to parse a given context-free grammar. In
  this case the collection of data involves Deterministic Finite Automata,
calculated first sets, keywords and transition labels.
* A grammar is defined by production rules (or just 'productions') that specify
which symbols may replace which other symbols; these rules may be used to
generate strings, or to parse them. Each such rule has a head, or left-hand
side, which consists of the string that may be replaced, and a body, or
right-hand side, which consists of a string that may replace it. In the
Python grammar, rules are written in the form
rule_name: rule_description;
  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
  grammar is a grammar in which the left-hand side of each production rule
  consists of only a single nonterminal symbol. Context-free grammars can
  always be recognized by a non-deterministic automaton.
* Terminal symbols are literal symbols which may appear in the outputs of the
production rules of the grammar and which cannot be changed using the rules
of the grammar. Applying the rules recursively to a source string of symbols
will usually terminate in a final output string consisting only of terminal
symbols.
* Nonterminal symbols are those symbols which can be replaced. The grammar
  includes a start symbol, a designated member of the set of nonterminals from
which all the strings in the language may be derived by successive
applications of the production rules.
* The language defined by the grammar is defined as the set of terminal strings
that can be derived using the production rules.
* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
that can appear in the first position of any string derived from the rule.
  This is useful for LL(1) parsers as the parser is only allowed to look at the
  next token in the input to know which rule it needs to parse. For example given
this grammar:
start: '(' A | B ')'
A: 'a' '<'
B: 'b' '<'
and the input '(b<)' the parser can only look at 'b' to know if it needs
  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
  that it needs to continue parsing rule B because only that rule can start
with 'b'.
Description
-----------
The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).
Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
which is then turned into a Deterministic Finite Automaton (DFA), which is
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
The DFA's are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.
Reference
---------
[Aho&Ullman 77]
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
(first edition)
"""
from ast import literal_eval
import collections
from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser
import enum
class LabelType(enum.Enum):
    """Category of a grammar label, as classified by ``Label._get_type``."""

    NONTERMINAL = 0  # lowercase rule names (e.g. 'expr')
    NAMED_TOKEN = 1  # uppercase token names (e.g. NAME, NUMBER)
    KEYWORD = 2      # quoted alphabetic literals (e.g. 'if')
    OPERATOR = 3     # quoted non-alphabetic literals (e.g. '+', '<>')
    NONE = 4
class Label(str):
    """A grammar label: a plain string plus a ``type`` classification.

    The label text itself is stored by ``str.__new__``; construction only
    adds a ``type`` attribute holding the matching ``LabelType`` member.
    """

    def __init__(self, value):
        # The string contents are already in place; classify them.
        self.type = self._get_type()

    def _get_type(self):
        """Classify this label's text into a ``LabelType``."""
        if not self[0].isalpha():
            # Keywords and operators come wrapped in matching quotes.
            assert self[0] == self[-1] in ('"', "'"), self
            unquoted = literal_eval(self)
            # A quoted word is a keyword; anything else is an operator.
            return (
                LabelType.KEYWORD if unquoted[0].isalpha() else LabelType.OPERATOR
            )
        # NAMED tokens (ASYNC, NAME...) are all uppercase by convention;
        # any other alphabetic label is a nonterminal rule name.
        return (
            LabelType.NAMED_TOKEN if self.upper() == self else LabelType.NONTERMINAL
        )

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())
class ParserGenerator(object):
    """Build LL(1) parsing tables from a grammar file and a token file.

    On construction this reads both input files, turns every grammar rule
    into a simplified DFA (``create_dfas``) and computes the FIRST set of
    every nonterminal (``calculate_first_sets``).  ``make_grammar`` then
    assembles the DFAs, labels and FIRST sets into a ``grammar.Grammar``
    object holding the final parsing tables.
    """

    def __init__(self, grammar_file, token_file, verbose=False):
        """Read the input files and precompute DFAs and FIRST sets.

        grammar_file -- path of the EBNF grammar description.
        token_file   -- path of the token definition file; its lines are
                        fed to token.generate_tokens / generate_opmap.
        verbose      -- if true, dump intermediate NFAs/DFAs and FIRST
                        sets to stdout while they are computed.
        """
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        # token name -> numeric token id
        self.tokens = dict(token.generate_tokens(token_lines))
        # operator text -> token name
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # map from symbol name to set of tokens
        self.calculate_first_sets()

    def create_dfas(self):
        """Parse the grammar and build one simplified DFA per rule.

        Returns a tuple ``(rule_to_dfas, start_nonterminal)`` where
        ``rule_to_dfas`` is an ordered mapping from rule name to its DFA
        and ``start_nonterminal`` is the first rule encountered (which
        becomes the grammar's start symbol).
        """
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            # NFA -> DFA, then minimize the number of states.
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            rule_to_dfas[dfa.name] = dfa
            if start_nonterminal is None:
                start_nonterminal = dfa.name
        return rule_to_dfas, start_nonterminal

    def make_grammar(self):
        """Assemble the DFAs into a complete ``grammar.Grammar``.

        Nonterminals are numbered from 256 upward, with the start symbol
        forced to the front of the numbering.  Each DFA state becomes a
        list of ``(label_index, next_state_index)`` arcs; a final state
        additionally gets a ``(0, self)`` arc marking acceptance.
        """
        c = grammar.Grammar()
        c.all_labels = set()
        # Put the start symbol first so it gets the lowest number (256).
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    # Label 0 on a self-arc marks an accepting state.
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]
        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")
            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first_sets(self, c, name):
        """Translate the raw FIRST set of *name* into label indices."""
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        """Return the integer index of *label*, registering it if new.

        Label indices are positions in ``c.labels``, so ``ilabel`` is
        taken as the would-be index (the current length) before any
        append; when the label is already known, its cached index is
        returned instead and ``ilabel`` is discarded.
        """
        label = Label(label)
        ilabel = len(c.labels)
        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword: stored as a NAME token with the keyword text.
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))

    def calculate_first_sets(self):
        """Compute the FIRST set of every rule (memoized in self.first)."""
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)
            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        """Compute FIRST(*name*), recursing into referenced rules.

        Raises ValueError on left recursion (detected via the None
        sentinel) and on ambiguity (two alternatives whose FIRST sets
        share a terminal, which an LL(1) parser cannot disambiguate).
        """
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        # FIRST only depends on the arcs out of the DFA's initial state.
        state = dfa.states[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                # Nonterminal: fold in its (possibly recursive) FIRST set.
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calculate_first_sets_for_rule(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                # Terminal: it is its own FIRST set.
                totalset.add(label)
                overlapcheck[label] = {label}
        # LL(1) check: no terminal may start more than one alternative.
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
|