diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-06-29 08:58:44 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-06-29 08:58:44 (GMT) |
commit | 436c3d58a2570f3b599e59b4071f944f774ec441 (patch) | |
tree | 0e2e1634bbeae51a1322d616943acc71211b475d /Lib | |
parent | 102f3ad676be1ef18fd9185b0dfb6c2796a6f8ac (diff) | |
download | cpython-436c3d58a2570f3b599e59b4071f944f774ec441.zip cpython-436c3d58a2570f3b599e59b4071f944f774ec441.tar.gz cpython-436c3d58a2570f3b599e59b4071f944f774ec441.tar.bz2 |
towards 1.6b1
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre.py | 23 | ||||
-rw-r--r-- | Lib/sre_compile.py | 66 | ||||
-rw-r--r-- | Lib/sre_constants.py | 72 | ||||
-rw-r--r-- | Lib/sre_parse.py | 113 |
4 files changed, 184 insertions, 90 deletions
@@ -12,6 +12,7 @@ # import sre_compile +import sre_parse # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE @@ -20,6 +21,13 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE S = DOTALL = sre_compile.SRE_FLAG_DOTALL X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE +# sre extensions (may or may not be in 1.6 final) +T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE +U = UNICODE = sre_compile.SRE_FLAG_UNICODE + +# sre exception +error = sre_parse.error + # -------------------------------------------------------------------- # public interface @@ -46,6 +54,9 @@ def findall(pattern, string, maxsplit=0): def compile(pattern, flags=0): return _compile(pattern, flags) +def template(pattern, flags=0): + return _compile(pattern, flags|T) + def escape(pattern): s = list(pattern) for i in range(len(pattern)): @@ -83,18 +94,14 @@ def _sub(pattern, template, string, count=0): # internal: pattern.sub implementation hook return _subn(pattern, template, string, count)[0] -def _expand(match, template): - # internal: expand template - return template # FIXME - def _subn(pattern, template, string, count=0): # internal: pattern.subn implementation hook if callable(template): filter = template else: - # FIXME: prepare template + template = sre_parse.parse_template(template, pattern) def filter(match, template=template): - return _expand(match, template) + return sre_parse.expand_template(template, match) n = i = 0 s = [] append = s.append @@ -108,6 +115,8 @@ def _subn(pattern, template, string, count=0): append(string[i:j]) append(filter(m)) i = m.end() + if i <= j: + break n = n + 1 if i < len(string): append(string[i:]) @@ -126,6 +135,8 @@ def _split(pattern, string, maxsplit=0): j = m.start() append(string[i:j]) i = m.end() + if i <= j: + break n = n + 1 if i < len(string): append(string[i:]) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 53da005..aeafe9d 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -48,7 +48,7 @@ class Code: print self.data raise -def _compile(code, pattern, flags, level=0): +def _compile(code, pattern, flags): append = code.append for op, av in pattern: if op is ANY: @@ -70,23 +70,26 @@ def _compile(code, pattern, flags, level=0): tail = [] for av in av[1]: skip = len(code); append(0) - _compile(code, av, flags, level) - append(OPCODES[JUMP]) - tail.append(len(code)); append(0) + _compile(code, av, flags) +## append(OPCODES[SUCCESS]) + append(OPCODES[JUMP]) + tail.append(len(code)); append(0) code[skip] = len(code) - skip append(0) # end of branch - for tail in tail: + for tail in tail: code[tail] = len(code) - tail elif op is CALL: append(OPCODES[op]) skip = len(code); append(0) - _compile(code, av, flags, level+1) + _compile(code, av, flags) append(OPCODES[SUCCESS]) code[skip] = len(code) - skip - elif op is CATEGORY: # not used by current parser + elif op is CATEGORY: append(OPCODES[op]) if flags & SRE_FLAG_LOCALE: append(CH_LOCALE[CHCODES[av]]) + elif flags & SRE_FLAG_UNICODE: + append(CH_UNICODE[CHCODES[av]]) else: append(CHCODES[av]) elif op is GROUP: @@ -98,8 +101,8 @@ def _compile(code, pattern, flags, level=0): elif op is IN: if flags & SRE_FLAG_IGNORECASE: append(OPCODES[OP_IGNORE[op]]) - def fixup(literal): - return ord(literal.lower()) + def fixup(literal, flags=flags): + return _sre.getlower(ord(literal), flags) else: append(OPCODES[op]) fixup = ord @@ -116,6 +119,8 @@ def _compile(code, pattern, flags, level=0): elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: append(CH_LOCALE[CHCODES[av]]) + elif flags & SRE_FLAG_UNICODE: + append(CH_UNICODE[CHCODES[av]]) else: append(CHCODES[av]) else: @@ -125,42 +130,49 @@ def _compile(code, pattern, flags, level=0): elif op in (LITERAL, NOT_LITERAL): if flags & SRE_FLAG_IGNORECASE: append(OPCODES[OP_IGNORE[op]]) - append(ord(av.lower())) else: append(OPCODES[op]) - append(ord(av)) + append(ord(av)) elif op is MARK: append(OPCODES[op]) append(av) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): - lo, hi = av[2].getwidth() - if lo == 0: - raise SyntaxError, "cannot repeat zero-width items" - if lo == hi == 1 and op is MAX_REPEAT: - append(OPCODES[MAX_REPEAT_ONE]) + if flags & SRE_FLAG_TEMPLATE: + append(OPCODES[REPEAT]) skip = len(code); append(0) append(av[0]) append(av[1]) - _compile(code, av[2], flags, level+1) + _compile(code, av[2], flags) append(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: - append(OPCODES[op]) - skip = len(code); append(0) - append(av[0]) - append(av[1]) - _compile(code, av[2], flags, level+1) - if op is MIN_REPEAT: - append(OPCODES[MIN_UNTIL]) + lo, hi = av[2].getwidth() + if lo == 0: + raise error, "nothing to repeat" + if 0 and lo == hi == 1 and op is MAX_REPEAT: + # FIXME: <fl> need a better way to figure out when + # it's safe to use this one (in the parser, probably) + append(OPCODES[MAX_REPEAT_ONE]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + append(OPCODES[SUCCESS]) + code[skip] = len(code) - skip else: - append(OPCODES[MAX_UNTIL]) - code[skip] = len(code) - skip + append(OPCODES[op]) + skip = len(code); append(0) + append(av[0]) + append(av[1]) + _compile(code, av[2], flags) + append(OPCODES[SUCCESS]) + code[skip] = len(code) - skip elif op is SUBPATTERN: group = av[0] if group: append(OPCODES[MARK]) append((group-1)*2) - _compile(code, av[1], flags, level+1) + _compile(code, av[1], flags) if group: append(OPCODES[MARK]) append((group-1)*2+1) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 531dc31..c996960 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -15,6 +15,11 @@ # other compatibility work. # +# should this really be here? + +class error(Exception): + pass + # operators FAILURE = "failure" @@ -30,20 +35,20 @@ GROUP = "group" GROUP_IGNORE = "group_ignore" IN = "in" IN_IGNORE = "in_ignore" +INFO = "info" JUMP = "jump" LITERAL = "literal" LITERAL_IGNORE = "literal_ignore" MARK = "mark" MAX_REPEAT = "max_repeat" MAX_REPEAT_ONE = "max_repeat_one" -MAX_UNTIL = "max_until" MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" NEGATE = "negate" NOT_LITERAL = "not_literal" NOT_LITERAL_IGNORE = "not_literal_ignore" RANGE = "range" REPEAT = "repeat" +REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" # positions @@ -63,14 +68,16 @@ CATEGORY_WORD = "category_word" CATEGORY_NOT_WORD = "category_not_word" CATEGORY_LINEBREAK = "category_linebreak" CATEGORY_NOT_LINEBREAK = "category_not_linebreak" -CATEGORY_LOC_DIGIT = "category_loc_digit" -CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit" -CATEGORY_LOC_SPACE = "category_loc_space" -CATEGORY_LOC_NOT_SPACE = "category_loc_not_space" CATEGORY_LOC_WORD = "category_loc_word" CATEGORY_LOC_NOT_WORD = "category_loc_not_word" -CATEGORY_LOC_LINEBREAK = "category_loc_linebreak" -CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak" +CATEGORY_UNI_DIGIT = "category_uni_digit" +CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit" +CATEGORY_UNI_SPACE = "category_uni_space" +CATEGORY_UNI_NOT_SPACE = "category_uni_not_space" +CATEGORY_UNI_WORD = "category_uni_word" +CATEGORY_UNI_NOT_WORD = "category_uni_not_word" +CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" +CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" OPCODES = [ @@ -85,12 +92,13 @@ OPCODES = [ CATEGORY, GROUP, GROUP_IGNORE, IN, IN_IGNORE, + INFO, JUMP, LITERAL, LITERAL_IGNORE, MARK, - MAX_REPEAT, MAX_UNTIL, + MAX_REPEAT, MAX_REPEAT_ONE, - MIN_REPEAT, MIN_UNTIL, + MIN_REPEAT, NOT_LITERAL, NOT_LITERAL_IGNORE, NEGATE, RANGE, @@ -106,10 +114,11 @@ ATCODES = [ CHCODES = [ CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, - CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT, - CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE, - CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD, - CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK + CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD, + CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT, + CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD, + CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK, + CATEGORY_UNI_NOT_LINEBREAK ] def makedict(list): @@ -138,23 +147,35 @@ AT_MULTILINE = { } CH_LOCALE = { - CATEGORY_DIGIT: CATEGORY_LOC_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_LOC_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE, + CATEGORY_DIGIT: CATEGORY_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, CATEGORY_WORD: CATEGORY_LOC_WORD, CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK + CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK +} + +CH_UNICODE = { + CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_UNI_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, + CATEGORY_WORD: CATEGORY_UNI_WORD, + CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK } # flags -SRE_FLAG_TEMPLATE = 1 # NYI +SRE_FLAG_TEMPLATE = 1 SRE_FLAG_IGNORECASE = 2 SRE_FLAG_LOCALE = 4 SRE_FLAG_MULTILINE = 8 SRE_FLAG_DOTALL = 16 -SRE_FLAG_VERBOSE = 32 +SRE_FLAG_UNICODE = 32 +SRE_FLAG_VERBOSE = 64 if __name__ == "__main__": import string @@ -168,5 +189,12 @@ if __name__ == "__main__": dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") + f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) + f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) + f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) + f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) + f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) + f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) + f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) f.close() print "done" diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 8e6705c..af6c6e1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -20,14 +20,15 @@ import _sre from sre_constants import * -# FIXME: should be 65535, but the array module currently chokes on -# unsigned integers larger than 32767... +# FIXME: <fl> should be 65535, but the array module currently chokes +# on unsigned integers larger than 32767 [fixed in 1.6b1?] MAXREPEAT = int(2L**(_sre.getcodesize()*8-1))-1 SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -# FIXME: string in tuple tests may explode with if char is unicode :-( +# FIXME: <fl> string in tuple tests may explode with if char is +# unicode [fixed in 1.6b1?] DIGITS = tuple(string.digits) OCTDIGITS = tuple("01234567") @@ -59,12 +60,15 @@ CATEGORIES = { } FLAGS = { + # standard flags "i": SRE_FLAG_IGNORECASE, "L": SRE_FLAG_LOCALE, "m": SRE_FLAG_MULTILINE, "s": SRE_FLAG_DOTALL, - "t": SRE_FLAG_TEMPLATE, "x": SRE_FLAG_VERBOSE, + # extensions + "t": SRE_FLAG_TEMPLATE, + "u": SRE_FLAG_UNICODE, } class State: @@ -151,7 +155,7 @@ class Tokenizer: try: c = self.string[self.index + 1] except IndexError: - raise SyntaxError, "bogus escape" + raise error, "bogus escape" char = char + c self.index = self.index + len(char) return char @@ -205,7 +209,7 @@ def _class_escape(source, escape): return LITERAL, escape[1] except ValueError: pass - raise SyntaxError, "bogus escape: %s" % repr(escape) + raise error, "bogus escape: %s" % repr(escape) def _escape(source, escape, state): # handle escape code in expression @@ -241,13 +245,12 @@ def _escape(source, escape, state): return LITERAL, escape[1] except ValueError: pass - raise SyntaxError, "bogus escape: %s" % repr(escape) + raise error, "bogus escape: %s" % repr(escape) def _branch(pattern, items): - # form a branch operator from a set of items (FIXME: move this - # optimization to the compiler module!) + # form a branch operator from a set of items subpattern = SubPattern(pattern) @@ -332,7 +335,7 @@ def _parse(source, state, flags=0): elif this: code1 = LITERAL, this else: - raise SyntaxError, "unexpected end of regular expression" + raise error, "unexpected end of regular expression" if source.match("-"): # potential range this = source.get() @@ -346,9 +349,9 @@ def _parse(source, state, flags=0): else: code2 = LITERAL, this if code1[0] != LITERAL or code2[0] != LITERAL: - raise SyntaxError, "illegal range" + raise error, "illegal range" if len(code1[1]) != 1 or len(code2[1]) != 1: - raise SyntaxError, "illegal range" + raise error, "illegal range" set.append((RANGE, (code1[1], code2[1]))) else: if code1[0] is IN: @@ -383,19 +386,19 @@ def _parse(source, state, flags=0): else: hi = lo if not source.match("}"): - raise SyntaxError, "bogus range" + raise error, "bogus range" if lo: min = int(lo) if hi: max = int(hi) # FIXME: <fl> check that hi >= lo! else: - raise SyntaxError, "not supported" + raise error, "not supported" # figure out which item to repeat if subpattern: item = subpattern[-1:] else: - raise SyntaxError, "nothing to repeat" + raise error, "nothing to repeat" if source.match("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -418,7 +421,7 @@ def _parse(source, state, flags=0): while 1: char = source.get() if char is None: - raise SyntaxError, "unterminated name" + raise error, "unterminated name" if char == ">": break # FIXME: check for valid character @@ -426,22 +429,21 @@ def _parse(source, state, flags=0): group = 1 elif source.match("="): # named backreference - raise SyntaxError, "not yet implemented" - + raise error, "not yet implemented" else: char = source.get() if char is None: - raise SyntaxError, "unexpected end of pattern" - raise SyntaxError, "unknown specifier: ?P%s" % char + raise error, "unexpected end of pattern" + raise error, "unknown specifier: ?P%s" % char elif source.match(":"): # non-capturing group group = 2 elif source.match("#"): # comment while 1: - char = source.get() - if char is None or char == ")": + if source.next is None or source.next == ")": break + source.get() else: # flags while FLAGS.has_key(source.next): @@ -465,13 +467,13 @@ def _parse(source, state, flags=0): elif source.match("|"): b.append(p) else: - raise SyntaxError, "group not properly closed" + raise error, "group not properly closed" else: while 1: char = source.get() if char is None or char == ")": break - # FIXME: skip characters? + raise error, "unknown extension" elif this == "^": subpattern.append((AT, AT_BEGINNING)) @@ -484,7 +486,7 @@ def _parse(source, state, flags=0): subpattern.append(code) else: - raise SyntaxError, "parser error" + raise error, "parser error" return subpattern @@ -499,17 +501,17 @@ def parse(pattern, flags=0): if tail == "|": b.append(p) elif tail == ")": - raise SyntaxError, "unbalanced parenthesis" + raise error, "unbalanced parenthesis" elif tail is None: if b: b.append(p) p = _branch(state, b) break else: - raise SyntaxError, "bogus characters at end of regular expression" + raise error, "bogus characters at end of regular expression" return p -def parse_replacement(source, pattern): +def parse_template(source, pattern): # parse 're' replacement string into list of literals and # group references s = Tokenizer(source) @@ -520,15 +522,56 @@ def parse_replacement(source, pattern): if this is None: break # end of replacement string if this and this[0] == "\\": - try: - a(LITERAL, ESCAPES[this]) - except KeyError: - for char in this: - a(LITERAL, char) + if this == "\\g": + name = "" + if s.match("<"): + while 1: + char = s.get() + if char is None: + raise error, "unterminated index" + if char == ">": + break + # FIXME: check for valid character + name = name + char + if not name: + raise error, "bad index" + try: + index = int(name) + except ValueError: + try: + index = pattern.groupindex[name] + except KeyError: + raise IndexError, "unknown index" + a((MARK, index)) + elif len(this) > 1 and this[1] in DIGITS: + while s.next in DIGITS: + this = this + s.get() + a((MARK, int(this[1:]))) + else: + try: + a(ESCAPES[this]) + except KeyError: + for char in this: + a((LITERAL, char)) else: - a(LITERAL, this) + a((LITERAL, this)) return p +def expand_template(template, match): + # FIXME: <fl> this is sooooo slow. drop in the slicelist + # code instead + p = [] + a = p.append + for c, s in template: + if c is LITERAL: + a(s) + elif c is MARK: + s = match.group(s) + if s is None: + raise error, "empty group" + a(s) + return match.string[:0].join(p) + if __name__ == "__main__": from pprint import pprint from testpatterns import PATTERNS @@ -548,7 +591,7 @@ if __name__ == "__main__": except: pass a = a + 1 - except SyntaxError, v: + except error, v: print "**", repr(pattern), v b = b + 1 print "-"*68 |