diff options
author | Guido van Rossum <guido@python.org> | 2000-04-10 17:10:48 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-04-10 17:10:48 (GMT) |
commit | b81e70ebdb28246e427249d386518bc03d08c959 (patch) | |
tree | 4f2ba435b4815d7ff7f4f6abab7505fb16f4c7c7 | |
parent | 5de435a245fd7158b1a8db1201154ad73fd4bf13 (diff) | |
download | cpython-b81e70ebdb28246e427249d386518bc03d08c959.zip cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.gz cpython-b81e70ebdb28246e427249d386518bc03d08c959.tar.bz2 |
Fredrik Lundh: new snapshot. Mostly reindented.
This one should work with unicode expressions, and compile
a bit more silently.
-rw-r--r-- | Lib/sre_compile.py | 250 |
-rw-r--r-- | Lib/sre_constants.py | 2 |
-rw-r--r-- | Lib/sre_parse.py | 723 |
3 files changed, 490 insertions, 485 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 600b237..8738061 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -26,7 +26,7 @@ from sre_constants import * # find an array type code that matches the engine's code size for WORDSIZE in "BHil": if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): - break + break else: raise RuntimeError, "cannot find a useable array type" @@ -34,18 +34,18 @@ else: class Code: def __init__(self): - self.data = [] + self.data = [] def __len__(self): - return len(self.data) + return len(self.data) def __getitem__(self, index): - return self.data[index] + return self.data[index] def __setitem__(self, index, code): - self.data[index] = code + self.data[index] = code def append(self, code): - self.data.append(code) + self.data.append(code) def todata(self): - # print self.data - return array.array(WORDSIZE, self.data).tostring() + # print self.data + return array.array(WORDSIZE, self.data).tostring() def _lower(literal): # return _sre._lower(literal) # FIXME @@ -54,122 +54,122 @@ def _lower(literal): def _compile(code, pattern, flags): append = code.append for op, av in pattern: - if op is ANY: - if "s" in flags: - append(CODES[op]) # any character at all! 
- else: - append(CODES[NOT_LITERAL]) - append(10) - elif op in (SUCCESS, FAILURE): - append(CODES[op]) - elif op is AT: - append(CODES[op]) - append(POSITIONS[av]) - elif op is BRANCH: - append(CODES[op]) - tail = [] - for av in av[1]: - skip = len(code); append(0) - _compile(code, av, flags) - append(CODES[JUMP]) - tail.append(len(code)); append(0) - code[skip] = len(code) - skip - append(0) # end of branch - for tail in tail: - code[tail] = len(code) - tail - elif op is CALL: - append(CODES[op]) - skip = len(code); append(0) - _compile(code, av, flags) - append(CODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is CATEGORY: # not used by current parser - append(CODES[op]) - append(CATEGORIES[av]) - elif op is GROUP: - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) - else: - append(CODES[op]) - append(av) - elif op is IN: - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) - def fixup(literal): - return ord(_lower(literal)) - else: - append(CODES[op]) - fixup = ord - skip = len(code); append(0) - for op, av in av: - append(CODES[op]) - if op is NEGATE: - pass - elif op is LITERAL: - append(fixup(av)) - elif op is RANGE: - append(fixup(av[0])) - append(fixup(av[1])) - elif op is CATEGORY: - append(CATEGORIES[av]) - else: - raise ValueError, "unsupported set operator" - append(CODES[FAILURE]) - code[skip] = len(code) - skip - elif op in (LITERAL, NOT_LITERAL): - if "i" in flags: - append(CODES[MAP_IGNORE[op]]) - append(ord(_lower(av))) - else: - append(CODES[op]) - append(ord(av)) - elif op is MARK: - append(CODES[op]) - append(av) - elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): - lo, hi = av[2].getwidth() - if lo == 0: - raise SyntaxError, "cannot repeat zero-width items" - if lo == hi == 1 and op is MAX_REPEAT: - append(CODES[MAX_REPEAT_ONE]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) - _compile(code, av[2], flags) - append(CODES[SUCCESS]) - code[skip] = len(code) - skip - else: - append(CODES[op]) - skip = len(code); append(0) - 
append(av[0]) - append(av[1]) - _compile(code, av[2], flags) - if op is MIN_REPEAT: - append(CODES[MIN_UNTIL]) - else: - # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?) - append(CODES[MAX_UNTIL]) - code[skip] = len(code) - skip - elif op is SUBPATTERN: -## group = av[0] -## if group: -## append(CODES[MARK]) -## append((group-1)*2) - _compile(code, av[1], flags) -## if group: -## append(CODES[MARK]) -## append((group-1)*2+1) - else: - raise ValueError, ("unsupported operand type", op) + if op is ANY: + if "s" in flags: + append(CODES[op]) # any character at all! + else: + append(CODES[NOT_LITERAL]) + append(10) + elif op in (SUCCESS, FAILURE): + append(CODES[op]) + elif op is AT: + append(CODES[op]) + append(POSITIONS[av]) + elif op is BRANCH: + append(CODES[op]) + tail = [] + for av in av[1]: + skip = len(code); append(0) + _compile(code, av, flags) + append(CODES[JUMP]) + tail.append(len(code)); append(0) + code[skip] = len(code) - skip + append(0) # end of branch + for tail in tail: + code[tail] = len(code) - tail + elif op is CALL: + append(CODES[op]) + skip = len(code); append(0) + _compile(code, av, flags) + append(CODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is CATEGORY: # not used by current parser + append(CODES[op]) + append(CATEGORIES[av]) + elif op is GROUP: + if "i" in flags: + append(CODES[MAP_IGNORE[op]]) + else: + append(CODES[op]) + append(av) + elif op is IN: + if "i" in flags: + append(CODES[MAP_IGNORE[op]]) + def fixup(literal): + return ord(_lower(literal)) + else: + append(CODES[op]) + fixup = ord + skip = len(code); append(0) + for op, av in av: + append(CODES[op]) + if op is NEGATE: + pass + elif op is LITERAL: + append(fixup(av)) + elif op is RANGE: + append(fixup(av[0])) + append(fixup(av[1])) + elif op is CATEGORY: + append(CATEGORIES[av]) + else: + raise ValueError, "unsupported set operator" + append(CODES[FAILURE]) + code[skip] = len(code) - skip + elif op in (LITERAL, NOT_LITERAL): + if "i" in flags: + 
append(CODES[MAP_IGNORE[op]]) + append(ord(_lower(av))) + else: + append(CODES[op]) + append(ord(av)) + elif op is MARK: + append(CODES[op]) + append(av) + elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): + lo, hi = av[2].getwidth() + if lo == 0: + raise SyntaxError, "cannot repeat zero-width items" + if lo == hi == 1 and op is MAX_REPEAT: + append(CODES[MAX_REPEAT_ONE]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + append(CODES[SUCCESS]) + code[skip] = len(code) - skip + else: + append(CODES[op]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + if op is MIN_REPEAT: + append(CODES[MIN_UNTIL]) + else: + # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?) + append(CODES[MAX_UNTIL]) + code[skip] = len(code) - skip + elif op is SUBPATTERN: +## group = av[0] +## if group: +## append(CODES[MARK]) +## append((group-1)*2) + _compile(code, av[1], flags) +## if group: +## append(CODES[MARK]) +## append((group-1)*2+1) + else: + raise ValueError, ("unsupported operand type", op) def compile(p, flags=()): # convert pattern list to internal format - if type(p) is type(""): - import sre_parse - pattern = p - p = sre_parse.parse(p) + if type(p) in (type(""), type(u"")): + import sre_parse + pattern = p + p = sre_parse.parse(p) else: - pattern = None + pattern = None # print p.getwidth() # print p code = Code() @@ -178,10 +178,10 @@ def compile(p, flags=()): # print list(code.data) data = code.todata() if 0: # debugging - print - print "-" * 68 - import sre_disasm - sre_disasm.disasm(data) - print "-" * 68 + print + print "-" * 68 + import sre_disasm + sre_disasm.disasm(data) + print "-" * 68 # print len(data), p.pattern.groups, len(p.pattern.groupdict) return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index f05c797..af88309 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -126,6 +126,6 @@ if __name__ 
== "__main__": f = open("sre_constants.h", "w") f.write("/* generated by sre_constants.py */\n") for k, v in items: - f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n") + f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n") f.close() print "done" diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index db4c500..8b68ea1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -26,8 +26,11 @@ from sre_constants import * SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -OCTDIGITS = "01234567" -HEXDIGITS = "0123456789abcdefABCDEF" +# FIXME: string in tuple tests may explode with if char is unicode :-( +DIGITS = tuple(string.digits) + +OCTDIGITS = tuple("01234567") +HEXDIGITS = tuple("0123456789abcdefABCDEF") ESCAPES = { "\\a": (LITERAL, chr(7)), @@ -55,168 +58,168 @@ CATEGORIES = { class Pattern: # FIXME: <fl> rename class, and store flags in here too! def __init__(self): - self.flags = [] - self.groups = 1 - self.groupdict = {} + self.flags = [] + self.groups = 1 + self.groupdict = {} def getgroup(self, name=None): - gid = self.groups - self.groups = gid + 1 - if name: - self.groupdict[name] = gid - return gid + gid = self.groups + self.groups = gid + 1 + if name: + self.groupdict[name] = gid + return gid def setflag(self, flag): - if flag not in self.flags: - self.flags.append(flag) + if flag in self.flags: + self.flags.append(flag) class SubPattern: # a subpattern, in intermediate form def __init__(self, pattern, data=None): - self.pattern = pattern - if not data: - data = [] - self.data = data - self.flags = [] - self.width = None + self.pattern = pattern + if not data: + data = [] + self.data = data + self.flags = [] + self.width = None def __repr__(self): - return repr(self.data) + return repr(self.data) def __len__(self): - return len(self.data) + return len(self.data) def __delitem__(self, index): - del self.data[index] + del self.data[index] def __getitem__(self, index): - return self.data[index] + return self.data[index] def 
__setitem__(self, index, code): - self.data[index] = code + self.data[index] = code def __getslice__(self, start, stop): - return SubPattern(self.pattern, self.data[start:stop]) + return SubPattern(self.pattern, self.data[start:stop]) def insert(self, index, code): - self.data.insert(index, code) + self.data.insert(index, code) def append(self, code): - self.data.append(code) + self.data.append(code) def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width: - return self.width - lo = hi = 0L - for op, av in self.data: - if op is BRANCH: - l = sys.maxint - h = 0 - for av in av[1]: - i, j = av.getwidth() - l = min(l, i) - h = min(h, j) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in (MIN_REPEAT, MAX_REPEAT): - i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] - elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): - lo = lo + 1 - hi = hi + 1 - elif op == SUCCESS: - break - self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) - return self.width + # determine the width (min, max) for this subpattern + if self.width: + return self.width + lo = hi = 0L + for op, av in self.data: + if op is BRANCH: + l = sys.maxint + h = 0 + for av in av[1]: + i, j = av.getwidth() + l = min(l, i) + h = min(h, j) + lo = lo + i + hi = hi + j + elif op is CALL: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[1].getwidth() + lo = lo + i + hi = hi + j + elif op in (MIN_REPEAT, MAX_REPEAT): + i, j = av[2].getwidth() + lo = lo + i * av[0] + hi = hi + j * av[1] + elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): + lo = lo + 1 + hi = hi + 1 + elif op == SUCCESS: + break + self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) + return self.width def set(self, flag): - if not flag in self.flags: - self.flags.append(flag) + if not 
flag in self.flags: + self.flags.append(flag) def reset(self, flag): - if flag in self.flags: - self.flags.remove(flag) + if flag in self.flags: + self.flags.remove(flag) class Tokenizer: def __init__(self, string): - self.string = list(string) - self.next = self.__next() + self.string = list(string) + self.next = self.__next() def __next(self): - if not self.string: - return None - char = self.string[0] - if char[0] == "\\": - try: - c = self.string[1] - except IndexError: - raise SyntaxError, "bogus escape" - char = char + c - try: - if c == "x": - # hexadecimal constant - for i in xrange(2, sys.maxint): - c = self.string[i] - if c not in HEXDIGITS: - break - char = char + c - elif c in string.digits: - # decimal (or octal) number - for i in xrange(2, sys.maxint): - c = self.string[i] - # FIXME: if larger than current number of - # groups, interpret as an octal number - if c not in string.digits: - break - char = char + c - except IndexError: - pass # use what we've got this far - del self.string[0:len(char)] - return char + if not self.string: + return None + char = self.string[0] + if char[0] == "\\": + try: + c = self.string[1] + except IndexError: + raise SyntaxError, "bogus escape" + char = char + c + try: + if c == "x": + # hexadecimal constant + for i in xrange(2, sys.maxint): + c = self.string[i] + if str(c) not in HEXDIGITS: + break + char = char + c + elif str(c) in DIGITS: + # decimal (or octal) number + for i in xrange(2, sys.maxint): + c = self.string[i] + # FIXME: if larger than current number of + # groups, interpret as an octal number + if str(c) not in DIGITS: + break + char = char + c + except IndexError: + pass # use what we've got this far + del self.string[0:len(char)] + return char def match(self, char): - if char == self.next: - self.next = self.__next() - return 1 - return 0 + if char == self.next: + self.next = self.__next() + return 1 + return 0 def match_set(self, set): - if self.next in set: - self.next = self.__next() - return 1 - 
return 0 + if self.next and self.next in set: + self.next = self.__next() + return 1 + return 0 def get(self): - this = self.next - self.next = self.__next() - return this + this = self.next + self.next = self.__next() + return this def _fixescape(escape, character_class=0): # convert escape to (type, value) if character_class: - # inside a character class, we'll look in the character - # escapes dictionary first - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) + # inside a character class, we'll look in the character + # escapes dictionary first + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) else: - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) if code: - return code + return code if not character_class: - try: - group = int(escape[1:]) - # FIXME: only valid if group <= current number of groups - return GROUP, group - except ValueError: - pass + try: + group = int(escape[1:]) + # FIXME: only valid if group <= current number of groups + return GROUP, group + except ValueError: + pass try: - if escape[1:2] == "x": - escape = escape[2:] - return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff) - elif escape[1:2] in string.digits: - return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff) - elif len(escape) == 2: - return LITERAL, escape[1] + if escape[1:2] == "x": + escape = escape[2:] + return LITERAL, chr(int(escape[-2:], 16) & 0xff) + elif str(escape[1:2]) in DIGITS: + return LITERAL, chr(int(escape[1:], 8) & 0xff) + elif len(escape) == 2: + return LITERAL, escape[1] except ValueError: - pass + pass raise SyntaxError, "bogus escape: %s" % repr(escape) def _branch(subpattern, items): @@ -226,35 +229,35 @@ def _branch(subpattern, items): # check if all items share a common prefix while 1: - prefix = None - for item in items: - if not item: - break - if prefix is None: - 
prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpattern.append(prefix) - continue # check next one - break + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: - break + if len(item) != 1 or item[0][0] != LITERAL: + break else: - # we can store this as a character set instead of a - # branch (FIXME: use a range if possible) - set = [] - for item in items: - set.append(item[0]) - subpattern.append((IN, set)) - return + # we can store this as a character set instead of a + # branch (FIXME: use a range if possible) + set = [] + for item in items: + set.append(item[0]) + subpattern.append((IN, set)) + return subpattern.append((BRANCH, (None, items))) @@ -268,178 +271,180 @@ def _parse(source, pattern, flags=()): while 1: - if source.next in ("|", ")"): - break # end of subpattern - this = source.get() - if this is None: - break # end of pattern - - if this and this[0] not in SPECIAL_CHARS: - subpattern.append((LITERAL, this)) - - elif this == "[": - # character set - set = [] -## if source.match(":"): -## pass # handle character classes - if source.match("^"): - set.append((NEGATE, None)) - # check remaining characters - start = set[:] - while 1: - this = source.get() - if this == "]" and set != start: - break - elif this and this[0] == "\\": - code1 = _fixescape(this, 1) - elif this: - code1 = LITERAL, this - else: - raise SyntaxError, "unexpected end of regular expression" - if source.match("-"): - # potential range - this = source.get() - if this == "]": - 
set.append(code1) - set.append((LITERAL, "-")) - break - else: - if this[0] == "\\": - code2 = _fixescape(this, 1) - else: - code2 = LITERAL, this - if code1[0] != LITERAL or code2[0] != LITERAL: - raise SyntaxError, "illegal range" - if len(code1[1]) != 1 or len(code2[1]) != 1: - raise SyntaxError, "illegal range" - set.append((RANGE, (code1[1], code2[1]))) - else: - if code1[0] is IN: - code1 = code1[1][0] - set.append(code1) - - # FIXME: <fl> move set optimization to support function - if len(set)==1 and set[0][0] is LITERAL: - subpattern.append(set[0]) # optimization - elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpattern.append((NOT_LITERAL, set[1][1])) # optimization - else: - # FIXME: <fl> add charmap optimization - subpattern.append((IN, set)) - - elif this and this[0] in REPEAT_CHARS: - # repeat previous item - if this == "?": - min, max = 0, 1 - elif this == "*": - min, max = 0, sys.maxint - elif this == "+": - min, max = 1, sys.maxint - elif this == "{": - min, max = 0, sys.maxint - lo = hi = "" - while source.next in string.digits: - lo = lo + source.get() - if source.match(","): - while source.next in string.digits: - hi = hi + source.get() - else: - hi = lo - if not source.match("}"): - raise SyntaxError, "bogus range" - if lo: - min = int(lo) - if hi: - max = int(hi) - # FIXME: <fl> check that hi >= lo! - else: - raise SyntaxError, "not supported" - # figure out which item to repeat - # FIXME: should back up to the right mark, right? 
- if subpattern: - index = len(subpattern)-1 - while subpattern[index][0] is MARK: - index = index - 1 - item = subpattern[index:index+1] - else: - raise SyntaxError, "nothing to repeat" - if source.match("?"): - subpattern[index] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[index] = (MAX_REPEAT, (min, max, item)) - elif this == ".": - subpattern.append((ANY, None)) - elif this == "(": - group = 1 - name = None - if source.match("?"): - group = 0 - # options - if source.match("P"): - # named group: skip forward to end of name - if source.match("<"): - name = "" - while 1: - char = source.get() - if char in (">", None): - break - name = name + char - group = 1 - elif source.match(":"): - # non-capturing group - group = 2 - elif source.match_set("iI"): - pattern.setflag("i") - elif source.match_set("lL"): - pattern.setflag("l") - elif source.match_set("mM"): - pattern.setflag("m") - elif source.match_set("sS"): - pattern.setflag("s") - elif source.match_set("xX"): - pattern.setflag("x") - if group: - # parse group contents - b = [] - if group == 2: - # anonymous group - group = None - else: - group = pattern.getgroup(name) - if group: - subpattern.append((MARK, (group-1)*2)) - while 1: - p = _parse(source, pattern, flags) - if source.match(")"): - if b: - b.append(p) - _branch(subpattern, b) - else: - subpattern.append((SUBPATTERN, (group, p))) - break - elif source.match("|"): - b.append(p) - else: - raise SyntaxError, "group not properly closed" - if group: - subpattern.append((MARK, (group-1)*2+1)) - else: - # FIXME: should this really be a while loop? 
- while source.get() not in (")", None): - pass - - elif this == "^": - subpattern.append((AT, AT_BEGINNING)) - - elif this == "$": - subpattern.append((AT, AT_END)) - - elif this and this[0] == "\\": - code =_fixescape(this) - subpattern.append(code) - - else: - raise SyntaxError, "parser error" + if str(source.next) in ("|", ")"): + break # end of subpattern + this = source.get() + if this is None: + break # end of pattern + + if this and this[0] not in SPECIAL_CHARS: + subpattern.append((LITERAL, this)) + + elif this == "[": + # character set + set = [] +## if source.match(":"): +## pass # handle character classes + if source.match("^"): + set.append((NEGATE, None)) + # check remaining characters + start = set[:] + while 1: + this = source.get() + if this == "]" and set != start: + break + elif this and this[0] == "\\": + code1 = _fixescape(this, 1) + elif this: + code1 = LITERAL, this + else: + raise SyntaxError, "unexpected end of regular expression" + if source.match("-"): + # potential range + this = source.get() + if this == "]": + set.append(code1) + set.append((LITERAL, "-")) + break + else: + if this[0] == "\\": + code2 = _fixescape(this, 1) + else: + code2 = LITERAL, this + if code1[0] != LITERAL or code2[0] != LITERAL: + raise SyntaxError, "illegal range" + if len(code1[1]) != 1 or len(code2[1]) != 1: + raise SyntaxError, "illegal range" + set.append((RANGE, (code1[1], code2[1]))) + else: + if code1[0] is IN: + code1 = code1[1][0] + set.append(code1) + + # FIXME: <fl> move set optimization to support function + if len(set)==1 and set[0][0] is LITERAL: + subpattern.append(set[0]) # optimization + elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: + subpattern.append((NOT_LITERAL, set[1][1])) # optimization + else: + # FIXME: <fl> add charmap optimization + subpattern.append((IN, set)) + + elif this and this[0] in REPEAT_CHARS: + # repeat previous item + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, sys.maxint + 
elif this == "+": + min, max = 1, sys.maxint + elif this == "{": + min, max = 0, sys.maxint + lo = hi = "" + while str(source.next) in DIGITS: + lo = lo + source.get() + if source.match(","): + while str(source.next) in DIGITS: + hi = hi + source.get() + else: + hi = lo + if not source.match("}"): + raise SyntaxError, "bogus range" + if lo: + min = int(lo) + if hi: + max = int(hi) + # FIXME: <fl> check that hi >= lo! + else: + raise SyntaxError, "not supported" + # figure out which item to repeat + # FIXME: should back up to the right mark, right? + if subpattern: + index = len(subpattern)-1 + while subpattern[index][0] is MARK: + index = index - 1 + item = subpattern[index:index+1] + else: + raise SyntaxError, "nothing to repeat" + if source.match("?"): + subpattern[index] = (MIN_REPEAT, (min, max, item)) + else: + subpattern[index] = (MAX_REPEAT, (min, max, item)) + elif this == ".": + subpattern.append((ANY, None)) + elif this == "(": + group = 1 + name = None + if source.match("?"): + group = 0 + # options + if source.match("P"): + # named group: skip forward to end of name + if source.match("<"): + name = "" + while 1: + char = source.get() + if char is None or char == ">": + break + name = name + char + group = 1 + elif source.match(":"): + # non-capturing group + group = 2 + elif source.match_set("iI"): + pattern.setflag("i") + elif source.match_set("lL"): + pattern.setflag("l") + elif source.match_set("mM"): + pattern.setflag("m") + elif source.match_set("sS"): + pattern.setflag("s") + elif source.match_set("xX"): + pattern.setflag("x") + if group: + # parse group contents + b = [] + if group == 2: + # anonymous group + group = None + else: + group = pattern.getgroup(name) + if group: + subpattern.append((MARK, (group-1)*2)) + while 1: + p = _parse(source, pattern, flags) + if source.match(")"): + if b: + b.append(p) + _branch(subpattern, b) + else: + subpattern.append((SUBPATTERN, (group, p))) + break + elif source.match("|"): + b.append(p) + else: + raise 
SyntaxError, "group not properly closed" + if group: + subpattern.append((MARK, (group-1)*2+1)) + else: + # FIXME: should this really be a while loop? + while 1: + char = source.get() + if char is None or char == ")": + break + + elif this == "^": + subpattern.append((AT, AT_BEGINNING)) + + elif this == "$": + subpattern.append((AT, AT_END)) + + elif this and this[0] == "\\": + code =_fixescape(this) + subpattern.append(code) + + else: + raise SyntaxError, "parser error" return subpattern @@ -448,20 +453,20 @@ def parse(source, flags=()): g = Pattern() b = [] while 1: - p = _parse(s, g, flags) - tail = s.get() - if tail == "|": - b.append(p) - elif tail == ")": - raise SyntaxError, "unbalanced parenthesis" - elif tail is None: - if b: - b.append(p) - p = SubPattern(g) - _branch(p, b) - break - else: - raise SyntaxError, "bogus characters at end of regular expression" + p = _parse(s, g, flags) + tail = s.get() + if tail == "|": + b.append(p) + elif tail == ")": + raise SyntaxError, "unbalanced parenthesis" + elif tail is None: + if b: + b.append(p) + p = SubPattern(g) + _branch(p, b) + break + else: + raise SyntaxError, "bogus characters at end of regular expression" return p if __name__ == "__main__": @@ -469,23 +474,23 @@ if __name__ == "__main__": from testpatterns import PATTERNS a = b = c = 0 for pattern, flags in PATTERNS: - if flags: - continue - print "-"*68 - try: - p = parse(pattern) - print repr(pattern), "->" - pprint(p.data) - import sre_compile - try: - code = sre_compile.compile(p) - c = c + 1 - except: - pass - a = a + 1 - except SyntaxError, v: - print "**", repr(pattern), v - b = b + 1 + if flags: + continue + print "-"*68 + try: + p = parse(pattern) + print repr(pattern), "->" + pprint(p.data) + import sre_compile + try: + code = sre_compile.compile(p) + c = c + 1 + except: + pass + a = a + 1 + except SyntaxError, v: + print "**", repr(pattern), v + b = b + 1 print "-"*68 print a, "of", b, "patterns successfully parsed" print c, "of", b, "patterns 
successfully compiled" |