diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-06-30 13:55:15 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-06-30 13:55:15 (GMT) |
commit | 0640e1161f37fd3415e9efdbde1e293efb98978c (patch) | |
tree | c008ad1c1f5e8610921bb893df78fa86f06fa2cf /Lib | |
parent | ae1b5b2e985eeea6433671d874ccaddfd36db5a6 (diff) | |
download | cpython-0640e1161f37fd3415e9efdbde1e293efb98978c.zip cpython-0640e1161f37fd3415e9efdbde1e293efb98978c.tar.gz cpython-0640e1161f37fd3415e9efdbde1e293efb98978c.tar.bz2 |
the mad patcher strikes again:
-- added pickling support (only works if sre is imported)
-- fixed wordsize problems in engine
(instead of casting literals down to the character size,
cast characters up to the literal size (same as the code
word size). this prevents false hits when you're matching
a unicode pattern against an 8-bit string. unfortunately,
this broke another test, but I think the test should be
changed in this case; more on that on python-dev)
-- added sre.purge function
(unofficial, clears the cache)
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre.py | 13 | ||||
-rw-r--r-- | Lib/sre_compile.py | 8 | ||||
-rw-r--r-- | Lib/sre_parse.py | 66 | ||||
-rw-r--r-- | Lib/test/output/test_sre | 3 |
4 files changed, 49 insertions, 41 deletions
@@ -89,6 +89,10 @@ def _compile(pattern, flags=0): _cache[key] = p return p +def purge(): + # clear pattern cache + _cache.clear() + def _sub(pattern, template, string, count=0): # internal: pattern.sub implementation hook return _subn(pattern, template, string, count)[0] @@ -142,3 +146,12 @@ def _split(pattern, string, maxsplit=0): n = n + 1 append(string[i:]) return s + +# register myself for pickling + +import copy_reg + +def _pickle(p): + return _compile, (p.pattern, p.flags) + +copy_reg.pickle(type(_compile("")), _pickle, _compile) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 0829c00..e48a7eb 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -31,15 +31,15 @@ def _compile(code, pattern, flags): emit(OPCODES[OP_IGNORE[op]]) else: emit(OPCODES[op]) - emit(ord(av)) + emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) def fixup(literal, flags=flags): - return _sre.getlower(ord(literal), flags) + return _sre.getlower(literal, flags) else: emit(OPCODES[op]) - fixup = ord + fixup = lambda x: x skip = len(code); emit(0) for op, av in av: emit(OPCODES[op]) @@ -165,7 +165,7 @@ def _compile_info(code, pattern, flags): if not (flags & SRE_FLAG_IGNORECASE): for op, av in pattern.data: if op is LITERAL: - prefix.append(ord(av)) + prefix.append(av) else: break # add an info block diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d3dbe00..fb954e9 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -19,6 +19,9 @@ from sre_constants import * # FIXME: should be 65535, but the arraymodule is still broken MAXREPEAT = 32767 +# FIXME: same here +CHARMASK = 0x7fff + SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -30,14 +33,14 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF") WHITESPACE = string.whitespace ESCAPES = { - r"\a": (LITERAL, chr(7)), - r"\b": (LITERAL, chr(8)), - r"\f": (LITERAL, chr(12)), - r"\n": (LITERAL, chr(10)), - r"\r": (LITERAL, chr(13)), - r"\t": (LITERAL, chr(9)), - r"\v": (LITERAL, 
chr(11)), - r"\\": (LITERAL, "\\") + r"\a": (LITERAL, 7), + r"\b": (LITERAL, 8), + r"\f": (LITERAL, 12), + r"\n": (LITERAL, 10), + r"\r": (LITERAL, 13), + r"\t": (LITERAL, 9), + r"\v": (LITERAL, 11), + r"\\": (LITERAL, ord("\\")) } CATEGORIES = { @@ -176,9 +179,6 @@ def isdigit(char): def isname(name): # check that group name is a valid string - # FIXME: <fl> this code is really lame. should use a regular - # expression instead, but I seem to have certain bootstrapping - # problems here ;-) if not isident(name[0]): return 0 for char in name: @@ -209,16 +209,14 @@ def _class_escape(source, escape): while source.next in HEXDIGITS: escape = escape + source.get() escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) + return LITERAL, int(escape[-4:], 16) & CHARMASK elif str(escape[1:2]) in OCTDIGITS: while source.next in OCTDIGITS: escape = escape + source.get() escape = escape[1:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-6:], 8) & 0xff) + return LITERAL, int(escape[-6:], 8) & CHARMASK if len(escape) == 2: - return LITERAL, escape[1] + return LITERAL, ord(escape[1]) except ValueError: pass raise error, "bogus escape: %s" % repr(escape) @@ -236,8 +234,7 @@ def _escape(source, escape, state): while source.next in HEXDIGITS: escape = escape + source.get() escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) + return LITERAL, int(escape[-4:], 16) & CHARMASK elif escape[1:2] in DIGITS: while 1: group = _group(escape, state.groups) @@ -251,17 +248,14 @@ def _escape(source, escape, state): else: break escape = escape[1:] - # FIXME: support unicode characters! 
- return LITERAL, chr(int(escape[-6:], 8) & 0xff) + return LITERAL, int(escape[-6:], 8) & CHARMASK if len(escape) == 2: - return LITERAL, escape[1] + return LITERAL, ord(escape[1]) except ValueError: pass raise error, "bogus escape: %s" % repr(escape) - def _branch(pattern, items): - # form a branch operator from a set of items subpattern = SubPattern(pattern) @@ -327,7 +321,7 @@ def _parse(source, state, flags=0): continue if this and this[0] not in SPECIAL_CHARS: - subpattern.append((LITERAL, this)) + subpattern.append((LITERAL, ord(this))) elif this == "[": # character set @@ -345,7 +339,7 @@ def _parse(source, state, flags=0): elif this and this[0] == "\\": code1 = _class_escape(source, this) elif this: - code1 = LITERAL, this + code1 = LITERAL, ord(this) else: raise error, "unexpected end of regular expression" if source.match("-"): @@ -353,17 +347,15 @@ def _parse(source, state, flags=0): this = source.get() if this == "]": set.append(code1) - set.append((LITERAL, "-")) + set.append((LITERAL, ord("-"))) break else: if this[0] == "\\": code2 = _class_escape(source, this) else: - code2 = LITERAL, this + code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: raise error, "illegal range" - if len(code1[1]) != 1 or len(code2[1]) != 1: - raise error, "illegal range" set.append((RANGE, (code1[1], code2[1]))) else: if code1[0] is IN: @@ -605,17 +597,16 @@ def parse_template(source, pattern): break if not code: this = this[1:] - # FIXME: support unicode characters! 
- code = LITERAL, chr(int(this[-6:], 8) & 0xff) + code = LITERAL, int(this[-6:], 8) & CHARMASK a(code) else: try: a(ESCAPES[this]) except KeyError: for c in this: - a((LITERAL, c)) + a((LITERAL, ord(c))) else: - a((LITERAL, this)) + a((LITERAL, ord(this))) return p def expand_template(template, match): @@ -623,12 +614,17 @@ def expand_template(template, match): # code instead p = [] a = p.append + sep = match.string[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr for c, s in template: if c is LITERAL: - a(s) + a(char(s)) elif c is MARK: s = match.group(s) if s is None: raise error, "empty group" a(s) - return match.string[:0].join(p) + return sep.join(p) diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre index d3732b5..10de93d 100644 --- a/Lib/test/output/test_sre +++ b/Lib/test/output/test_sre @@ -1,6 +1,5 @@ test_sre -test_support -- test failed re module pickle -test_support -- test failed re module cPickle +=== Failed incorrectly ('\\x00ffffffffffffff', '\377', 0, 'found', '\377') === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a') === grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/' |