diff options
Diffstat (limited to 'Parser')
-rw-r--r-- | Parser/asdl.py | 8 | ||||
-rwxr-xr-x | Parser/asdl_c.py | 100 | ||||
-rw-r--r-- | Parser/spark.py | 1520 |
3 files changed, 814 insertions, 814 deletions
diff --git a/Parser/asdl.py b/Parser/asdl.py index 0db4e3b..5b856dd 100644 --- a/Parser/asdl.py +++ b/Parser/asdl.py @@ -142,7 +142,7 @@ class ASDLParser(spark.GenericParser, object): def p_product(self, (_0, fields, _1)): " product ::= ( fields ) " # XXX can't I just construct things in the right order? - fields.reverse() + fields.reverse() return Product(fields) def p_sum_0(self, (constructor,)): @@ -164,7 +164,7 @@ class ASDLParser(spark.GenericParser, object): def p_constructor_1(self, (id, _0, fields, _1)): " constructor ::= Id ( fields ) " # XXX can't I just construct things in the right order? - fields.reverse() + fields.reverse() return Constructor(id, fields) def p_fields_0(self, (field,)): @@ -355,7 +355,7 @@ def check(mod): v.errors += 1 uses = ", ".join(v.types[t]) print "Undefined type %s, used in %s" % (t, uses) - + return not v.errors def parse(file): @@ -380,7 +380,7 @@ if __name__ == "__main__": else: testdir = "tests" files = glob.glob(testdir + "/*.asdl") - + for file in files: print file mod = parse(file) diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py index b1fc77d..766be67 100755 --- a/Parser/asdl_c.py +++ b/Parser/asdl_c.py @@ -439,7 +439,7 @@ class FreeVisitor(PickleVisitor): self.emit("", 0) self.emit("free(o);", 1) self.func_end() - + def visitConstructor(self, cons, enum, name): self.emit("case %s_kind:" % cons.name, 1) for f in cons.fields: @@ -487,76 +487,76 @@ class FreeVisitor(PickleVisitor): else: ctype = get_c_type(field.type) self.emit("free_%s((%s)%s);" % (field.type, ctype, value), depth) - + class MarshalUtilVisitor(StaticVisitor): CODE = ''' #define CHECKSIZE(BUF, OFF, MIN) { \\ - int need = *(OFF) + MIN; \\ - if (need >= PyString_GET_SIZE(*(BUF))) { \\ - int newsize = PyString_GET_SIZE(*(BUF)) * 2; \\ - if (newsize < need) \\ - newsize = need; \\ - if (_PyString_Resize((BUF), newsize) < 0) \\ - return 0; \\ - } \\ -} - -static int + int need = *(OFF) + MIN; \\ + if (need >= PyString_GET_SIZE(*(BUF))) { \\ + int newsize = PyString_GET_SIZE(*(BUF)) * 2; \\ + if (newsize < need) \\ + newsize = need; \\ + if (_PyString_Resize((BUF), newsize) < 0) \\ + return 0; \\ + } \\ +} + +static int marshal_write_int(PyObject **buf, int *offset, int x) { - char *s; - - CHECKSIZE(buf, offset, 4) - s = PyString_AS_STRING(*buf) + (*offset); - s[0] = (x & 0xff); - s[1] = (x >> 8) & 0xff; - s[2] = (x >> 16) & 0xff; - s[3] = (x >> 24) & 0xff; - *offset += 4; - return 1; + char *s; + + CHECKSIZE(buf, offset, 4) + s = PyString_AS_STRING(*buf) + (*offset); + s[0] = (x & 0xff); + s[1] = (x >> 8) & 0xff; + s[2] = (x >> 16) & 0xff; + s[3] = (x >> 24) & 0xff; + *offset += 4; + return 1; } -static int +static int marshal_write_bool(PyObject **buf, int *offset, bool b) { - if (b) - marshal_write_int(buf, offset, 1); - else - marshal_write_int(buf, offset, 0); - return 1; + if (b) + marshal_write_int(buf, offset, 1); + else + marshal_write_int(buf, offset, 0); + return 1; } -static int +static int marshal_write_identifier(PyObject **buf, int *offset, identifier id) { - int l = PyString_GET_SIZE(id); - marshal_write_int(buf, offset, l); - CHECKSIZE(buf, offset, l); - memcpy(PyString_AS_STRING(*buf) + *offset, - PyString_AS_STRING(id), l); - *offset += l; - return 1; + int l = PyString_GET_SIZE(id); + marshal_write_int(buf, offset, l); + CHECKSIZE(buf, offset, l); + memcpy(PyString_AS_STRING(*buf) + *offset, + PyString_AS_STRING(id), l); + *offset += l; + return 1; } -static int +static int marshal_write_string(PyObject **buf, int *offset, string s) { - int len = PyString_GET_SIZE(s); - marshal_write_int(buf, offset, len); - CHECKSIZE(buf, offset, len); - memcpy(PyString_AS_STRING(*buf) + *offset, - PyString_AS_STRING(s), len); - *offset += len; - return 1; + int len = PyString_GET_SIZE(s); + marshal_write_int(buf, offset, len); + CHECKSIZE(buf, offset, len); + memcpy(PyString_AS_STRING(*buf) + *offset, + PyString_AS_STRING(s), len); + *offset += len; + return 1; } -static int +static int marshal_write_object(PyObject **buf, int *offset, object s) { - /* XXX */ - return 0; + /* XXX */ + return 0; } ''' @@ -575,7 +575,7 @@ class MarshalFunctionVisitor(PickleVisitor): self.emit("return 1;", 1) self.emit("}", 0) self.emit("", 0) - + def visitSum(self, sum, name): self.func_begin(name, has_sequence(sum.types, False)) simple = is_simple(sum) @@ -594,7 +594,7 @@ class MarshalFunctionVisitor(PickleVisitor): for field in prod.fields: self.visitField(field, name, 1, 1) self.func_end() - + def visitConstructor(self, cons, enum, name, simple): if simple: self.emit("case %s:" % cons.name, 1) diff --git a/Parser/spark.py b/Parser/spark.py index 00d9733..2c18623 100644 --- a/Parser/spark.py +++ b/Parser/spark.py @@ -1,5 +1,5 @@ # Copyright (c) 1998-2002 John Aycock -# +# # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including @@ -7,10 +7,10 @@ # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: -# +# # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -26,651 +26,651 @@ import sys import string def _namelist(instance): - namelist, namedict, classlist = [], {}, [instance.__class__] - for c in classlist: - for b in c.__bases__: - classlist.append(b) - for name in c.__dict__.keys(): - if not namedict.has_key(name): - namelist.append(name) - namedict[name] = 1 - return namelist + namelist, namedict, classlist = [], {}, [instance.__class__] + for c in classlist: + for b in c.__bases__: + classlist.append(b) + for name in c.__dict__.keys(): + if not namedict.has_key(name): + namelist.append(name) + namedict[name] = 1 + return namelist class GenericScanner: - def __init__(self, flags=0): - pattern = self.reflect() - self.re = re.compile(pattern, re.VERBOSE|flags) - - self.index2func = {} - for name, number in self.re.groupindex.items(): - self.index2func[number-1] = getattr(self, 't_' + name) - - def makeRE(self, name): - doc = getattr(self, name).__doc__ - rv = '(?P<%s>%s)' % (name[2:], doc) - return rv - - def reflect(self): - rv = [] - for name in _namelist(self): - if name[:2] == 't_' and name != 't_default': - rv.append(self.makeRE(name)) - - rv.append(self.makeRE('t_default')) - return string.join(rv, '|') - - def error(self, s, pos): - print "Lexical error at position %s" % pos - raise SystemExit - - def tokenize(self, s): - pos = 0 - n = len(s) - while pos < n: - m = self.re.match(s, pos) - if m is None: - self.error(s, pos) - - groups = m.groups() - for i in range(len(groups)): - if groups[i] and self.index2func.has_key(i): - self.index2func[i](groups[i]) - pos = m.end() - - def t_default(self, s): - r'( . | \n )+' - print "Specification error: unmatched input" - raise SystemExit + def __init__(self, flags=0): + pattern = self.reflect() + self.re = re.compile(pattern, re.VERBOSE|flags) + + self.index2func = {} + for name, number in self.re.groupindex.items(): + self.index2func[number-1] = getattr(self, 't_' + name) + + def makeRE(self, name): + doc = getattr(self, name).__doc__ + rv = '(?P<%s>%s)' % (name[2:], doc) + return rv + + def reflect(self): + rv = [] + for name in _namelist(self): + if name[:2] == 't_' and name != 't_default': + rv.append(self.makeRE(name)) + + rv.append(self.makeRE('t_default')) + return string.join(rv, '|') + + def error(self, s, pos): + print "Lexical error at position %s" % pos + raise SystemExit + + def tokenize(self, s): + pos = 0 + n = len(s) + while pos < n: + m = self.re.match(s, pos) + if m is None: + self.error(s, pos) + + groups = m.groups() + for i in range(len(groups)): + if groups[i] and self.index2func.has_key(i): + self.index2func[i](groups[i]) + pos = m.end() + + def t_default(self, s): + r'( . | \n )+' + print "Specification error: unmatched input" + raise SystemExit # # Extracted from GenericParser and made global so that [un]picking works. # class _State: - def __init__(self, stateno, items): - self.T, self.complete, self.items = [], [], items - self.stateno = stateno + def __init__(self, stateno, items): + self.T, self.complete, self.items = [], [], items + self.stateno = stateno class GenericParser: - # - # An Earley parser, as per J. Earley, "An Efficient Context-Free - # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, - # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, - # Carnegie-Mellon University, August 1968. New formulation of - # the parser according to J. Aycock, "Practical Earley Parsing - # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, - # 2001, and J. Aycock and R. N. Horspool, "Practical Earley - # Parsing", unpublished paper, 2001. - # - - def __init__(self, start): - self.rules = {} - self.rule2func = {} - self.rule2name = {} - self.collectRules() - self.augment(start) - self.ruleschanged = 1 - - _NULLABLE = '\e_' - _START = 'START' - _BOF = '|-' - - # - # When pickling, take the time to generate the full state machine; - # some information is then extraneous, too. Unfortunately we - # can't save the rule2func map. - # - def __getstate__(self): - if self.ruleschanged: - # - # XXX - duplicated from parse() - # - self.computeNull() - self.newrules = {} - self.new2old = {} - self.makeNewRules() - self.ruleschanged = 0 - self.edges, self.cores = {}, {} - self.states = { 0: self.makeState0() } - self.makeState(0, self._BOF) - # - # XXX - should find a better way to do this.. - # - changes = 1 - while changes: - changes = 0 - for k, v in self.edges.items(): - if v is None: - state, sym = k - if self.states.has_key(state): - self.goto(state, sym) - changes = 1 - rv = self.__dict__.copy() - for s in self.states.values(): - del s.items - del rv['rule2func'] - del rv['nullable'] - del rv['cores'] - return rv - - def __setstate__(self, D): - self.rules = {} - self.rule2func = {} - self.rule2name = {} - self.collectRules() - start = D['rules'][self._START][0][1][1] # Blech. - self.augment(start) - D['rule2func'] = self.rule2func - D['makeSet'] = self.makeSet_fast - self.__dict__ = D - - # - # A hook for GenericASTBuilder and GenericASTMatcher. Mess - # thee not with this; nor shall thee toucheth the _preprocess - # argument to addRule. - # - def preprocess(self, rule, func): return rule, func - - def addRule(self, doc, func, _preprocess=1): - fn = func - rules = string.split(doc) - - index = [] - for i in range(len(rules)): - if rules[i] == '::=': - index.append(i-1) - index.append(len(rules)) - - for i in range(len(index)-1): - lhs = rules[index[i]] - rhs = rules[index[i]+2:index[i+1]] - rule = (lhs, tuple(rhs)) - - if _preprocess: - rule, fn = self.preprocess(rule, func) - - if self.rules.has_key(lhs): - self.rules[lhs].append(rule) - else: - self.rules[lhs] = [ rule ] - self.rule2func[rule] = fn - self.rule2name[rule] = func.__name__[2:] - self.ruleschanged = 1 - - def collectRules(self): - for name in _namelist(self): - if name[:2] == 'p_': - func = getattr(self, name) - doc = func.__doc__ - self.addRule(doc, func) - - def augment(self, start): - rule = '%s ::= %s %s' % (self._START, self._BOF, start) - self.addRule(rule, lambda args: args[1], 0) - - def computeNull(self): - self.nullable = {} - tbd = [] - - for rulelist in self.rules.values(): - lhs = rulelist[0][0] - self.nullable[lhs] = 0 - for rule in rulelist: - rhs = rule[1] - if len(rhs) == 0: - self.nullable[lhs] = 1 - continue - # - # We only need to consider rules which - # consist entirely of nonterminal symbols. - # This should be a savings on typical - # grammars. - # - for sym in rhs: - if not self.rules.has_key(sym): - break - else: - tbd.append(rule) - changes = 1 - while changes: - changes = 0 - for lhs, rhs in tbd: - if self.nullable[lhs]: - continue - for sym in rhs: - if not self.nullable[sym]: - break - else: - self.nullable[lhs] = 1 - changes = 1 - - def makeState0(self): - s0 = _State(0, []) - for rule in self.newrules[self._START]: - s0.items.append((rule, 0)) - return s0 - - def finalState(self, tokens): - # - # Yuck. - # - if len(self.newrules[self._START]) == 2 and len(tokens) == 0: - return 1 - start = self.rules[self._START][0][1][1] - return self.goto(1, start) - - def makeNewRules(self): - worklist = [] - for rulelist in self.rules.values(): - for rule in rulelist: - worklist.append((rule, 0, 1, rule)) - - for rule, i, candidate, oldrule in worklist: - lhs, rhs = rule - n = len(rhs) - while i < n: - sym = rhs[i] - if not self.rules.has_key(sym) or \ - not self.nullable[sym]: - candidate = 0 - i = i + 1 - continue - - newrhs = list(rhs) - newrhs[i] = self._NULLABLE+sym - newrule = (lhs, tuple(newrhs)) - worklist.append((newrule, i+1, - candidate, oldrule)) - candidate = 0 - i = i + 1 - else: - if candidate: - lhs = self._NULLABLE+lhs - rule = (lhs, rhs) - if self.newrules.has_key(lhs): - self.newrules[lhs].append(rule) - else: - self.newrules[lhs] = [ rule ] - self.new2old[rule] = oldrule - - def typestring(self, token): - return None - - def error(self, token): - print "Syntax error at or near `%s' token" % token - raise SystemExit - - def parse(self, tokens): - sets = [ [(1,0), (2,0)] ] - self.links = {} - - if self.ruleschanged: - self.computeNull() - self.newrules = {} - self.new2old = {} - self.makeNewRules() - self.ruleschanged = 0 - self.edges, self.cores = {}, {} - self.states = { 0: self.makeState0() } - self.makeState(0, self._BOF) - - for i in xrange(len(tokens)): - sets.append([]) - - if sets[i] == []: - break - self.makeSet(tokens[i], sets, i) - else: - sets.append([]) - self.makeSet(None, sets, len(tokens)) - - #_dump(tokens, sets, self.states) - - finalitem = (self.finalState(tokens), 0) - if finalitem not in sets[-2]: - if len(tokens) > 0: - self.error(tokens[i-1]) - else: - self.error(None) - - return self.buildTree(self._START, finalitem, - tokens, len(sets)-2) - - def isnullable(self, sym): - # - # For symbols in G_e only. If we weren't supporting 1.5, - # could just use sym.startswith(). - # - return self._NULLABLE == sym[0:len(self._NULLABLE)] - - def skip(self, (lhs, rhs), pos=0): - n = len(rhs) - while pos < n: - if not self.isnullable(rhs[pos]): - break - pos = pos + 1 - return pos - - def makeState(self, state, sym): - assert sym is not None - # - # Compute \epsilon-kernel state's core and see if - # it exists already. - # - kitems = [] - for rule, pos in self.states[state].items: - lhs, rhs = rule - if rhs[pos:pos+1] == (sym,): - kitems.append((rule, self.skip(rule, pos+1))) - core = kitems - - core.sort() - tcore = tuple(core) - if self.cores.has_key(tcore): - return self.cores[tcore] - # - # Nope, doesn't exist. Compute it and the associated - # \epsilon-nonkernel state together; we'll need it right away. - # - k = self.cores[tcore] = len(self.states) - K, NK = _State(k, kitems), _State(k+1, []) - self.states[k] = K - predicted = {} - - edges = self.edges - rules = self.newrules - for X in K, NK: - worklist = X.items - for item in worklist: - rule, pos = item - lhs, rhs = rule - if pos == len(rhs): - X.complete.append(rule) - continue - - nextSym = rhs[pos] - key = (X.stateno, nextSym) - if not rules.has_key(nextSym): - if not edges.has_key(key): - edges[key] = None - X.T.append(nextSym) - else: - edges[key] = None - if not predicted.has_key(nextSym): - predicted[nextSym] = 1 - for prule in rules[nextSym]: - ppos = self.skip(prule) - new = (prule, ppos) - NK.items.append(new) - # - # Problem: we know K needs generating, but we - # don't yet know about NK. Can't commit anything - # regarding NK to self.edges until we're sure. Should - # we delay committing on both K and NK to avoid this - # hacky code? This creates other problems.. - # - if X is K: - edges = {} - - if NK.items == []: - return k - - # - # Check for \epsilon-nonkernel's core. Unfortunately we - # need to know the entire set of predicted nonterminals - # to do this without accidentally duplicating states. - # - core = predicted.keys() - core.sort() - tcore = tuple(core) - if self.cores.has_key(tcore): - self.edges[(k, None)] = self.cores[tcore] - return k - - nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno - self.edges.update(edges) - self.states[nk] = NK - return k - - def goto(self, state, sym): - key = (state, sym) - if not self.edges.has_key(key): - # - # No transitions from state on sym. - # - return None - - rv = self.edges[key] - if rv is None: - # - # Target state isn't generated yet. Remedy this. - # - rv = self.makeState(state, sym) - self.edges[key] = rv - return rv - - def gotoT(self, state, t): - return [self.goto(state, t)] - - def gotoST(self, state, st): - rv = [] - for t in self.states[state].T: - if st == t: - rv.append(self.goto(state, t)) - return rv - - def add(self, set, item, i=None, predecessor=None, causal=None): - if predecessor is None: - if item not in set: - set.append(item) - else: - key = (item, i) - if item not in set: - self.links[key] = [] - set.append(item) - self.links[key].append((predecessor, causal)) - - def makeSet(self, token, sets, i): - cur, next = sets[i], sets[i+1] - - ttype = token is not None and self.typestring(token) or None - if ttype is not None: - fn, arg = self.gotoT, ttype - else: - fn, arg = self.gotoST, token - - for item in cur: - ptr = (item, i) - state, parent = item - add = fn(state, arg) - for k in add: - if k is not None: - self.add(next, (k, parent), i+1, ptr) - nk = self.goto(k, None) - if nk is not None: - self.add(next, (nk, i+1)) - - if parent == i: - continue - - for rule in self.states[state].complete: - lhs, rhs = rule - for pitem in sets[parent]: - pstate, pparent = pitem - k = self.goto(pstate, lhs) - if k is not None: - why = (item, i, rule) - pptr = (pitem, parent) - self.add(cur, (k, pparent), - i, pptr, why) - nk = self.goto(k, None) - if nk is not None: - self.add(cur, (nk, i)) - - def makeSet_fast(self, token, sets, i): - # - # Call *only* when the entire state machine has been built! - # It relies on self.edges being filled in completely, and - # then duplicates and inlines code to boost speed at the - # cost of extreme ugliness. - # - cur, next = sets[i], sets[i+1] - ttype = token is not None and self.typestring(token) or None - - for item in cur: - ptr = (item, i) - state, parent = item - if ttype is not None: - k = self.edges.get((state, ttype), None) - if k is not None: - #self.add(next, (k, parent), i+1, ptr) - #INLINED --v - new = (k, parent) - key = (new, i+1) - if new not in next: - self.links[key] = [] - next.append(new) - self.links[key].append((ptr, None)) - #INLINED --^ - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - #self.add(next, (nk, i+1)) - #INLINED --v - new = (nk, i+1) - if new not in next: - next.append(new) - #INLINED --^ - else: - add = self.gotoST(state, token) - for k in add: - if k is not None: - self.add(next, (k, parent), i+1, ptr) - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - self.add(next, (nk, i+1)) - - if parent == i: - continue - - for rule in self.states[state].complete: - lhs, rhs = rule - for pitem in sets[parent]: - pstate, pparent = pitem - #k = self.goto(pstate, lhs) - k = self.edges.get((pstate, lhs), None) - if k is not None: - why = (item, i, rule) - pptr = (pitem, parent) - #self.add(cur, (k, pparent), - # i, pptr, why) - #INLINED --v - new = (k, pparent) - key = (new, i) - if new not in cur: - self.links[key] = [] - cur.append(new) - self.links[key].append((pptr, why)) - #INLINED --^ - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - #self.add(cur, (nk, i)) - #INLINED --v - new = (nk, i) - if new not in cur: - cur.append(new) - #INLINED --^ - - def predecessor(self, key, causal): - for p, c in self.links[key]: - if c == causal: - return p - assert 0 - - def causal(self, key): - links = self.links[key] - if len(links) == 1: - return links[0][1] - choices = [] - rule2cause = {} - for p, c in links: - rule = c[2] - choices.append(rule) - rule2cause[rule] = c - return rule2cause[self.ambiguity(choices)] - - def deriveEpsilon(self, nt): - if len(self.newrules[nt]) > 1: - rule = self.ambiguity(self.newrules[nt]) - else: - rule = self.newrules[nt][0] - #print rule - - rhs = rule[1] - attr = [None] * len(rhs) - - for i in range(len(rhs)-1, -1, -1): - attr[i] = self.deriveEpsilon(rhs[i]) - return self.rule2func[self.new2old[rule]](attr) - - def buildTree(self, nt, item, tokens, k): - state, parent = item - - choices = [] - for rule in self.states[state].complete: - if rule[0] == nt: - choices.append(rule) - rule = choices[0] - if len(choices) > 1: - rule = self.ambiguity(choices) - #print rule - - rhs = rule[1] - attr = [None] * len(rhs) - - for i in range(len(rhs)-1, -1, -1): - sym = rhs[i] - if not self.newrules.has_key(sym): - if sym != self._BOF: - attr[i] = tokens[k-1] - key = (item, k) - item, k = self.predecessor(key, None) - #elif self.isnullable(sym): - elif self._NULLABLE == sym[0:len(self._NULLABLE)]: - attr[i] = self.deriveEpsilon(sym) - else: - key = (item, k) - why = self.causal(key) - attr[i] = self.buildTree(sym, why[0], - tokens, why[1]) - item, k = self.predecessor(key, why) - return self.rule2func[self.new2old[rule]](attr) - - def ambiguity(self, rules): - # - # XXX - problem here and in collectRules() if the same rule - # appears in >1 method. Also undefined results if rules - # causing the ambiguity appear in the same method. - # - sortlist = [] - name2index = {} - for i in range(len(rules)): - lhs, rhs = rule = rules[i] - name = self.rule2name[self.new2old[rule]] - sortlist.append((len(rhs), name)) - name2index[name] = i - sortlist.sort() - list = map(lambda (a,b): b, sortlist) - return rules[name2index[self.resolve(list)]] - - def resolve(self, list): - # - # Resolve ambiguity in favor of the shortest RHS. - # Since we walk the tree from the top down, this - # should effectively resolve in favor of a "shift". - # - return list[0] + # + # An Earley parser, as per J. Earley, "An Efficient Context-Free + # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, + # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, + # Carnegie-Mellon University, August 1968. New formulation of + # the parser according to J. Aycock, "Practical Earley Parsing + # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, + # 2001, and J. Aycock and R. N. Horspool, "Practical Earley + # Parsing", unpublished paper, 2001. + # + + def __init__(self, start): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + self.augment(start) + self.ruleschanged = 1 + + _NULLABLE = '\e_' + _START = 'START' + _BOF = '|-' + + # + # When pickling, take the time to generate the full state machine; + # some information is then extraneous, too. Unfortunately we + # can't save the rule2func map. + # + def __getstate__(self): + if self.ruleschanged: + # + # XXX - duplicated from parse() + # + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + # + # XXX - should find a better way to do this.. + # + changes = 1 + while changes: + changes = 0 + for k, v in self.edges.items(): + if v is None: + state, sym = k + if self.states.has_key(state): + self.goto(state, sym) + changes = 1 + rv = self.__dict__.copy() + for s in self.states.values(): + del s.items + del rv['rule2func'] + del rv['nullable'] + del rv['cores'] + return rv + + def __setstate__(self, D): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + start = D['rules'][self._START][0][1][1] # Blech. + self.augment(start) + D['rule2func'] = self.rule2func + D['makeSet'] = self.makeSet_fast + self.__dict__ = D + + # + # A hook for GenericASTBuilder and GenericASTMatcher. Mess + # thee not with this; nor shall thee toucheth the _preprocess + # argument to addRule. + # + def preprocess(self, rule, func): return rule, func + + def addRule(self, doc, func, _preprocess=1): + fn = func + rules = string.split(doc) + + index = [] + for i in range(len(rules)): + if rules[i] == '::=': + index.append(i-1) + index.append(len(rules)) + + for i in range(len(index)-1): + lhs = rules[index[i]] + rhs = rules[index[i]+2:index[i+1]] + rule = (lhs, tuple(rhs)) + + if _preprocess: + rule, fn = self.preprocess(rule, func) + + if self.rules.has_key(lhs): + self.rules[lhs].append(rule) + else: + self.rules[lhs] = [ rule ] + self.rule2func[rule] = fn + self.rule2name[rule] = func.__name__[2:] + self.ruleschanged = 1 + + def collectRules(self): + for name in _namelist(self): + if name[:2] == 'p_': + func = getattr(self, name) + doc = func.__doc__ + self.addRule(doc, func) + + def augment(self, start): + rule = '%s ::= %s %s' % (self._START, self._BOF, start) + self.addRule(rule, lambda args: args[1], 0) + + def computeNull(self): + self.nullable = {} + tbd = [] + + for rulelist in self.rules.values(): + lhs = rulelist[0][0] + self.nullable[lhs] = 0 + for rule in rulelist: + rhs = rule[1] + if len(rhs) == 0: + self.nullable[lhs] = 1 + continue + # + # We only need to consider rules which + # consist entirely of nonterminal symbols. + # This should be a savings on typical + # grammars. + # + for sym in rhs: + if not self.rules.has_key(sym): + break + else: + tbd.append(rule) + changes = 1 + while changes: + changes = 0 + for lhs, rhs in tbd: + if self.nullable[lhs]: + continue + for sym in rhs: + if not self.nullable[sym]: + break + else: + self.nullable[lhs] = 1 + changes = 1 + + def makeState0(self): + s0 = _State(0, []) + for rule in self.newrules[self._START]: + s0.items.append((rule, 0)) + return s0 + + def finalState(self, tokens): + # + # Yuck. + # + if len(self.newrules[self._START]) == 2 and len(tokens) == 0: + return 1 + start = self.rules[self._START][0][1][1] + return self.goto(1, start) + + def makeNewRules(self): + worklist = [] + for rulelist in self.rules.values(): + for rule in rulelist: + worklist.append((rule, 0, 1, rule)) + + for rule, i, candidate, oldrule in worklist: + lhs, rhs = rule + n = len(rhs) + while i < n: + sym = rhs[i] + if not self.rules.has_key(sym) or \ + not self.nullable[sym]: + candidate = 0 + i = i + 1 + continue + + newrhs = list(rhs) + newrhs[i] = self._NULLABLE+sym + newrule = (lhs, tuple(newrhs)) + worklist.append((newrule, i+1, + candidate, oldrule)) + candidate = 0 + i = i + 1 + else: + if candidate: + lhs = self._NULLABLE+lhs + rule = (lhs, rhs) + if self.newrules.has_key(lhs): + self.newrules[lhs].append(rule) + else: + self.newrules[lhs] = [ rule ] + self.new2old[rule] = oldrule + + def typestring(self, token): + return None + + def error(self, token): + print "Syntax error at or near `%s' token" % token + raise SystemExit + + def parse(self, tokens): + sets = [ [(1,0), (2,0)] ] + self.links = {} + + if self.ruleschanged: + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + + for i in xrange(len(tokens)): + sets.append([]) + + if sets[i] == []: + break + self.makeSet(tokens[i], sets, i) + else: + sets.append([]) + self.makeSet(None, sets, len(tokens)) + + #_dump(tokens, sets, self.states) + + finalitem = (self.finalState(tokens), 0) + if finalitem not in sets[-2]: + if len(tokens) > 0: + self.error(tokens[i-1]) + else: + self.error(None) + + return self.buildTree(self._START, finalitem, + tokens, len(sets)-2) + + def isnullable(self, sym): + # + # For symbols in G_e only. If we weren't supporting 1.5, + # could just use sym.startswith(). + # + return self._NULLABLE == sym[0:len(self._NULLABLE)] + + def skip(self, (lhs, rhs), pos=0): + n = len(rhs) + while pos < n: + if not self.isnullable(rhs[pos]): + break + pos = pos + 1 + return pos + + def makeState(self, state, sym): + assert sym is not None + # + # Compute \epsilon-kernel state's core and see if + # it exists already. + # + kitems = [] + for rule, pos in self.states[state].items: + lhs, rhs = rule + if rhs[pos:pos+1] == (sym,): + kitems.append((rule, self.skip(rule, pos+1))) + core = kitems + + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + return self.cores[tcore] + # + # Nope, doesn't exist. Compute it and the associated + # \epsilon-nonkernel state together; we'll need it right away. + # + k = self.cores[tcore] = len(self.states) + K, NK = _State(k, kitems), _State(k+1, []) + self.states[k] = K + predicted = {} + + edges = self.edges + rules = self.newrules + for X in K, NK: + worklist = X.items + for item in worklist: + rule, pos = item + lhs, rhs = rule + if pos == len(rhs): + X.complete.append(rule) + continue + + nextSym = rhs[pos] + key = (X.stateno, nextSym) + if not rules.has_key(nextSym): + if not edges.has_key(key): + edges[key] = None + X.T.append(nextSym) + else: + edges[key] = None + if not predicted.has_key(nextSym): + predicted[nextSym] = 1 + for prule in rules[nextSym]: + ppos = self.skip(prule) + new = (prule, ppos) + NK.items.append(new) + # + # Problem: we know K needs generating, but we + # don't yet know about NK. Can't commit anything + # regarding NK to self.edges until we're sure. Should + # we delay committing on both K and NK to avoid this + # hacky code? This creates other problems.. + # + if X is K: + edges = {} + + if NK.items == []: + return k + + # + # Check for \epsilon-nonkernel's core. Unfortunately we + # need to know the entire set of predicted nonterminals + # to do this without accidentally duplicating states. + # + core = predicted.keys() + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + self.edges[(k, None)] = self.cores[tcore] + return k + + nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno + self.edges.update(edges) + self.states[nk] = NK + return k + + def goto(self, state, sym): + key = (state, sym) + if not self.edges.has_key(key): + # + # No transitions from state on sym. + # + return None + + rv = self.edges[key] + if rv is None: + # + # Target state isn't generated yet. Remedy this. + # + rv = self.makeState(state, sym) + self.edges[key] = rv + return rv + + def gotoT(self, state, t): + return [self.goto(state, t)] + + def gotoST(self, state, st): + rv = [] + for t in self.states[state].T: + if st == t: + rv.append(self.goto(state, t)) + return rv + + def add(self, set, item, i=None, predecessor=None, causal=None): + if predecessor is None: + if item not in set: + set.append(item) + else: + key = (item, i) + if item not in set: + self.links[key] = [] + set.append(item) + self.links[key].append((predecessor, causal)) + + def makeSet(self, token, sets, i): + cur, next = sets[i], sets[i+1] + + ttype = token is not None and self.typestring(token) or None + if ttype is not None: + fn, arg = self.gotoT, ttype + else: + fn, arg = self.gotoST, token + + for item in cur: + ptr = (item, i) + state, parent = item + add = fn(state, arg) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + nk = self.goto(k, None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + k = self.goto(pstate, lhs) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + self.add(cur, (k, pparent), + i, pptr, why) + nk = self.goto(k, None) + if nk is not None: + self.add(cur, (nk, i)) + + def makeSet_fast(self, token, sets, i): + # + # Call *only* when the entire state machine has been built! + # It relies on self.edges being filled in completely, and + # then duplicates and inlines code to boost speed at the + # cost of extreme ugliness. + # + cur, next = sets[i], sets[i+1] + ttype = token is not None and self.typestring(token) or None + + for item in cur: + ptr = (item, i) + state, parent = item + if ttype is not None: + k = self.edges.get((state, ttype), None) + if k is not None: + #self.add(next, (k, parent), i+1, ptr) + #INLINED --v + new = (k, parent) + key = (new, i+1) + if new not in next: + self.links[key] = [] + next.append(new) + self.links[key].append((ptr, None)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(next, (nk, i+1)) + #INLINED --v + new = (nk, i+1) + if new not in next: + next.append(new) + #INLINED --^ + else: + add = self.gotoST(state, token) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + #k = self.goto(pstate, lhs) + k = self.edges.get((pstate, lhs), None) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + #self.add(cur, (k, pparent), + # i, pptr, why) + #INLINED --v + new = (k, pparent) + key = (new, i) + if new not in cur: + self.links[key] = [] + cur.append(new) + self.links[key].append((pptr, why)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(cur, (nk, i)) + #INLINED --v + new = (nk, i) + if new not in cur: + cur.append(new) + #INLINED --^ + + def predecessor(self, key, causal): + for p, c in self.links[key]: + if c == causal: + return p + assert 0 + + def causal(self, key): + links = self.links[key] + if len(links) == 1: + return links[0][1] + choices = [] + rule2cause = {} + for p, c in links: + rule = c[2] + choices.append(rule) + rule2cause[rule] = c + return rule2cause[self.ambiguity(choices)] + + def deriveEpsilon(self, nt): + if len(self.newrules[nt]) > 1: + rule = self.ambiguity(self.newrules[nt]) + else: + rule = self.newrules[nt][0] + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + attr[i] = self.deriveEpsilon(rhs[i]) + return self.rule2func[self.new2old[rule]](attr) + + def buildTree(self, nt, item, tokens, k): + state, parent = item + + choices = [] + for rule in self.states[state].complete: + if rule[0] == nt: + choices.append(rule) + rule = choices[0] + if len(choices) > 1: + rule = self.ambiguity(choices) + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + sym = rhs[i] + if not self.newrules.has_key(sym): + if sym != self._BOF: + attr[i] = tokens[k-1] + key = (item, k) + item, k = self.predecessor(key, None) + #elif self.isnullable(sym): + elif self._NULLABLE == sym[0:len(self._NULLABLE)]: + attr[i] = self.deriveEpsilon(sym) + else: + key = (item, k) + why = self.causal(key) + attr[i] = self.buildTree(sym, why[0], + tokens, why[1]) + item, k = self.predecessor(key, why) + return self.rule2func[self.new2old[rule]](attr) + + def ambiguity(self, rules): + # + # XXX - problem here and in collectRules() if the same rule + # appears in >1 method. Also undefined results if rules + # causing the ambiguity appear in the same method. + # + sortlist = [] + name2index = {} + for i in range(len(rules)): + lhs, rhs = rule = rules[i] + name = self.rule2name[self.new2old[rule]] + sortlist.append((len(rhs), name)) + name2index[name] = i + sortlist.sort() + list = map(lambda (a,b): b, sortlist) + return rules[name2index[self.resolve(list)]] + + def resolve(self, list): + # + # Resolve ambiguity in favor of the shortest RHS. + # Since we walk the tree from the top down, this + # should effectively resolve in favor of a "shift". + # + return list[0] # # GenericASTBuilder automagically constructs a concrete/abstract syntax tree @@ -681,32 +681,32 @@ class GenericParser: # class GenericASTBuilder(GenericParser): - def __init__(self, AST, start): - GenericParser.__init__(self, start) - self.AST = AST - - def preprocess(self, rule, func): - rebind = lambda lhs, self=self: \ - lambda args, lhs=lhs, self=self: \ - self.buildASTNode(args, lhs) - lhs, rhs = rule - return rule, rebind(lhs) - - def buildASTNode(self, args, lhs): - children = [] - for arg in args: - if isinstance(arg, self.AST): - children.append(arg) - else: - children.append(self.terminal(arg)) - return self.nonterminal(lhs, children) - - def terminal(self, token): return token - - def nonterminal(self, type, args): - rv = self.AST(type) - rv[:len(args)] = args - return rv + def __init__(self, AST, start): + GenericParser.__init__(self, start) + self.AST = AST + + def preprocess(self, rule, func): + rebind = lambda lhs, self=self: \ + lambda args, lhs=lhs, self=self: \ + self.buildASTNode(args, lhs) + lhs, rhs = rule + return rule, rebind(lhs) + + def buildASTNode(self, args, lhs): + children = [] + for arg in args: + if isinstance(arg, self.AST): + children.append(arg) + else: + children.append(self.terminal(arg)) + return self.nonterminal(lhs, children) + + def terminal(self, token): return token + + def nonterminal(self, type, args): + rv = self.AST(type) + rv[:len(args)] = args + return rv # # GenericASTTraversal is a Visitor pattern according to Design Patterns. For @@ -719,57 +719,57 @@ class GenericASTBuilder(GenericParser): # class GenericASTTraversalPruningException: - pass + pass class GenericASTTraversal: - def __init__(self, ast): - self.ast = ast + def __init__(self, ast): + self.ast = ast - def typestring(self, node): - return node.type + def typestring(self, node): + return node.type - def prune(self): - raise GenericASTTraversalPruningException + def prune(self): + raise GenericASTTraversalPruningException - def preorder(self, node=None): - if node is None: - node = self.ast + def preorder(self, node=None): + if node is None: + node = self.ast - try: - name = 'n_' + self.typestring(node) - if hasattr(self, name): - func = getattr(self, name) - func(node) - else: - self.default(node) - except GenericASTTraversalPruningException: - return + try: + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + except GenericASTTraversalPruningException: + return - for kid in node: - self.preorder(kid) + for kid in node: + self.preorder(kid) - name = name + '_exit' - if hasattr(self, name): - func = getattr(self, name) - func(node) + name = name + '_exit' + if hasattr(self, name): + func = getattr(self, name) + func(node) - def postorder(self, node=None): - if node is None: - node = self.ast + def postorder(self, node=None): + if node is None: + node = self.ast - for kid in node: - self.postorder(kid) + for kid in node: + self.postorder(kid) - name = 'n_' + self.typestring(node) - if hasattr(self, name): - func = getattr(self, name) - func(node) - else: - self.default(node) + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) - def default(self, node): - pass + def default(self, node): + pass # # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" @@ -779,62 +779,62 @@ class GenericASTTraversal: # class GenericASTMatcher(GenericParser): - def __init__(self, start, ast): - GenericParser.__init__(self, start) - self.ast = ast - - def preprocess(self, rule, func): - rebind = lambda func, self=self: \ - lambda args, func=func, self=self: \ - self.foundMatch(args, func) - lhs, rhs = rule - rhslist = list(rhs) - rhslist.reverse() - - return (lhs, tuple(rhslist)), rebind(func) - - def foundMatch(self, args, func): - func(args[-1]) - return args[-1] - - def match_r(self, node): - self.input.insert(0, node) - children = 0 - - for child in node: - if children == 0: - self.input.insert(0, '(') - children = children + 1 - self.match_r(child) - - if children > 0: - self.input.insert(0, ')') - - def match(self, ast=None): - if ast is None: - ast = self.ast - self.input = [] - - self.match_r(ast) - self.parse(self.input) - - def resolve(self, list): - # - # Resolve ambiguity in favor of the longest RHS. - # - return list[-1] + def __init__(self, start, ast): + GenericParser.__init__(self, start) + self.ast = ast + + def preprocess(self, rule, func): + rebind = lambda func, self=self: \ + lambda args, func=func, self=self: \ + self.foundMatch(args, func) + lhs, rhs = rule + rhslist = list(rhs) + rhslist.reverse() + + return (lhs, tuple(rhslist)), rebind(func) + + def foundMatch(self, args, func): + func(args[-1]) + return args[-1] + + def match_r(self, node): + self.input.insert(0, node) + children = 0 + + for child in node: + if children == 0: + self.input.insert(0, '(') + children = children + 1 + self.match_r(child) + + if children > 0: + self.input.insert(0, ')') + + def match(self, ast=None): + if ast is None: + ast = self.ast + self.input = [] + + self.match_r(ast) + self.parse(self.input) + + def resolve(self, list): + # + # Resolve ambiguity in favor of the longest RHS. + # + return list[-1] def _dump(tokens, sets, states): - for i in range(len(sets)): - print 'set', i - for item in sets[i]: - print '\t', item - for (lhs, rhs), pos in states[item[0]].items: - print '\t\t', lhs, '::=', - print string.join(rhs[:pos]), - print '.', - print string.join(rhs[pos:]) - if i < len(tokens): - print - print 'token', str(tokens[i]) - print + for i in range(len(sets)): + print 'set', i + for item in sets[i]: + print '\t', item + for (lhs, rhs), pos in states[item[0]].items: + print '\t\t', lhs, '::=', + print string.join(rhs[:pos]), + print '.', + print string.join(rhs[pos:]) + if i < len(tokens): + print + print 'token', str(tokens[i]) + print |