1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
#!/usr/bin/env python3.8
import argparse
import pprint
import sys
from typing import Dict, Set
from pegen.build import build_parser
from pegen.grammar import (
Alt,
Cut,
Gather,
GrammarVisitor,
Group,
Lookahead,
NamedItem,
NameLeaf,
NegativeLookahead,
Opt,
Repeat0,
Repeat1,
Rhs,
Rule,
StringLeaf,
)
from pegen.parser_generator import compute_nullables
argparser = argparse.ArgumentParser(
prog="calculate_first_sets",
description="Calculate the first sets of a grammar",
)
argparser.add_argument("grammar_file", help="The grammar file")
class FirstSetCalculator(GrammarVisitor):
def __init__(self, rules: Dict[str, Rule]) -> None:
self.rules = rules
self.nullables = compute_nullables(rules)
self.first_sets: Dict[str, Set[str]] = dict()
self.in_process: Set[str] = set()
def calculate(self) -> Dict[str, Set[str]]:
for name, rule in self.rules.items():
self.visit(rule)
return self.first_sets
def visit_Alt(self, item: Alt) -> Set[str]:
result: Set[str] = set()
to_remove: Set[str] = set()
for other in item.items:
new_terminals = self.visit(other)
if isinstance(other.item, NegativeLookahead):
to_remove |= new_terminals
result |= new_terminals
if to_remove:
result -= to_remove
# If the set of new terminals can start with the empty string,
# it means that the item is completelly nullable and we should
# also considering at least the next item in case the current
# one fails to parse.
if "" in new_terminals:
continue
if not isinstance(other.item, (Opt, NegativeLookahead, Repeat0)):
break
# Do not allow the empty string to propagate.
result.discard("")
return result
def visit_Cut(self, item: Cut) -> Set[str]:
return set()
def visit_Group(self, item: Group) -> Set[str]:
return self.visit(item.rhs)
def visit_PositiveLookahead(self, item: Lookahead) -> Set[str]:
return self.visit(item.node)
def visit_NegativeLookahead(self, item: NegativeLookahead) -> Set[str]:
return self.visit(item.node)
def visit_NamedItem(self, item: NamedItem) -> Set[str]:
return self.visit(item.item)
def visit_Opt(self, item: Opt) -> Set[str]:
return self.visit(item.node)
def visit_Gather(self, item: Gather) -> Set[str]:
return self.visit(item.node)
def visit_Repeat0(self, item: Repeat0) -> Set[str]:
return self.visit(item.node)
def visit_Repeat1(self, item: Repeat1) -> Set[str]:
return self.visit(item.node)
def visit_NameLeaf(self, item: NameLeaf) -> Set[str]:
if item.value not in self.rules:
return {item.value}
if item.value not in self.first_sets:
self.first_sets[item.value] = self.visit(self.rules[item.value])
return self.first_sets[item.value]
elif item.value in self.in_process:
return set()
return self.first_sets[item.value]
def visit_StringLeaf(self, item: StringLeaf) -> Set[str]:
return {item.value}
def visit_Rhs(self, item: Rhs) -> Set[str]:
result: Set[str] = set()
for alt in item.alts:
result |= self.visit(alt)
return result
def visit_Rule(self, item: Rule) -> Set[str]:
if item.name in self.in_process:
return set()
elif item.name not in self.first_sets:
self.in_process.add(item.name)
terminals = self.visit(item.rhs)
if item in self.nullables:
terminals.add("")
self.first_sets[item.name] = terminals
self.in_process.remove(item.name)
return self.first_sets[item.name]
def main() -> None:
args = argparser.parse_args()
try:
grammar, parser, tokenizer = build_parser(args.grammar_file)
except Exception as err:
print("ERROR: Failed to parse grammar file", file=sys.stderr)
sys.exit(1)
firs_sets = FirstSetCalculator(grammar.rules).calculate()
pprint.pprint(firs_sets)
if __name__ == "__main__":
main()
|