summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/sre.py 13
-rw-r--r-- Lib/sre_compile.py 8
-rw-r--r-- Lib/sre_parse.py 66
-rw-r--r-- Lib/test/output/test_sre 3
-rw-r--r-- Modules/_sre.c 49
5 files changed, 73 insertions, 66 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index 97a5140..5e6aeeb 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -89,6 +89,10 @@ def _compile(pattern, flags=0):
_cache[key] = p
return p
+def purge():
+ # clear pattern cache
+ _cache.clear()
+
def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0]
@@ -142,3 +146,12 @@ def _split(pattern, string, maxsplit=0):
n = n + 1
append(string[i:])
return s
+
+# register myself for pickling
+
+import copy_reg
+
+def _pickle(p):
+ return _compile, (p.pattern, p.flags)
+
+copy_reg.pickle(type(_compile("")), _pickle, _compile)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 0829c00..e48a7eb 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -31,15 +31,15 @@ def _compile(code, pattern, flags):
emit(OPCODES[OP_IGNORE[op]])
else:
emit(OPCODES[op])
- emit(ord(av))
+ emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
def fixup(literal, flags=flags):
- return _sre.getlower(ord(literal), flags)
+ return _sre.getlower(literal, flags)
else:
emit(OPCODES[op])
- fixup = ord
+ fixup = lambda x: x
skip = len(code); emit(0)
for op, av in av:
emit(OPCODES[op])
@@ -165,7 +165,7 @@ def _compile_info(code, pattern, flags):
if not (flags & SRE_FLAG_IGNORECASE):
for op, av in pattern.data:
if op is LITERAL:
- prefix.append(ord(av))
+ prefix.append(av)
else:
break
# add an info block
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index d3dbe00..fb954e9 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -19,6 +19,9 @@ from sre_constants import *
# FIXME: should be 65535, but the arraymodule is still broken
MAXREPEAT = 32767
+# FIXME: same here
+CHARMASK = 0x7fff
+
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
@@ -30,14 +33,14 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF")
WHITESPACE = string.whitespace
ESCAPES = {
- r"\a": (LITERAL, chr(7)),
- r"\b": (LITERAL, chr(8)),
- r"\f": (LITERAL, chr(12)),
- r"\n": (LITERAL, chr(10)),
- r"\r": (LITERAL, chr(13)),
- r"\t": (LITERAL, chr(9)),
- r"\v": (LITERAL, chr(11)),
- r"\\": (LITERAL, "\\")
+ r"\a": (LITERAL, 7),
+ r"\b": (LITERAL, 8),
+ r"\f": (LITERAL, 12),
+ r"\n": (LITERAL, 10),
+ r"\r": (LITERAL, 13),
+ r"\t": (LITERAL, 9),
+ r"\v": (LITERAL, 11),
+ r"\\": (LITERAL, ord("\\"))
}
CATEGORIES = {
@@ -176,9 +179,6 @@ def isdigit(char):
def isname(name):
# check that group name is a valid string
- # FIXME: <fl> this code is really lame. should use a regular
- # expression instead, but I seem to have certain bootstrapping
- # problems here ;-)
if not isident(name[0]):
return 0
for char in name:
@@ -209,16 +209,14 @@ def _class_escape(source, escape):
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
- # FIXME: support unicode characters!
- return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+ return LITERAL, int(escape[-4:], 16) & CHARMASK
elif str(escape[1:2]) in OCTDIGITS:
while source.next in OCTDIGITS:
escape = escape + source.get()
escape = escape[1:]
- # FIXME: support unicode characters!
- return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+ return LITERAL, int(escape[-6:], 8) & CHARMASK
if len(escape) == 2:
- return LITERAL, escape[1]
+ return LITERAL, ord(escape[1])
except ValueError:
pass
raise error, "bogus escape: %s" % repr(escape)
@@ -236,8 +234,7 @@ def _escape(source, escape, state):
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
- # FIXME: support unicode characters!
- return LITERAL, chr(int(escape[-4:], 16) & 0xff)
+ return LITERAL, int(escape[-4:], 16) & CHARMASK
elif escape[1:2] in DIGITS:
while 1:
group = _group(escape, state.groups)
@@ -251,17 +248,14 @@ def _escape(source, escape, state):
else:
break
escape = escape[1:]
- # FIXME: support unicode characters!
- return LITERAL, chr(int(escape[-6:], 8) & 0xff)
+ return LITERAL, int(escape[-6:], 8) & CHARMASK
if len(escape) == 2:
- return LITERAL, escape[1]
+ return LITERAL, ord(escape[1])
except ValueError:
pass
raise error, "bogus escape: %s" % repr(escape)
-
def _branch(pattern, items):
-
# form a branch operator from a set of items
subpattern = SubPattern(pattern)
@@ -327,7 +321,7 @@ def _parse(source, state, flags=0):
continue
if this and this[0] not in SPECIAL_CHARS:
- subpattern.append((LITERAL, this))
+ subpattern.append((LITERAL, ord(this)))
elif this == "[":
# character set
@@ -345,7 +339,7 @@ def _parse(source, state, flags=0):
elif this and this[0] == "\\":
code1 = _class_escape(source, this)
elif this:
- code1 = LITERAL, this
+ code1 = LITERAL, ord(this)
else:
raise error, "unexpected end of regular expression"
if source.match("-"):
@@ -353,17 +347,15 @@ def _parse(source, state, flags=0):
this = source.get()
if this == "]":
set.append(code1)
- set.append((LITERAL, "-"))
+ set.append((LITERAL, ord("-")))
break
else:
if this[0] == "\\":
code2 = _class_escape(source, this)
else:
- code2 = LITERAL, this
+ code2 = LITERAL, ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise error, "illegal range"
- if len(code1[1]) != 1 or len(code2[1]) != 1:
- raise error, "illegal range"
set.append((RANGE, (code1[1], code2[1])))
else:
if code1[0] is IN:
@@ -605,17 +597,16 @@ def parse_template(source, pattern):
break
if not code:
this = this[1:]
- # FIXME: support unicode characters!
- code = LITERAL, chr(int(this[-6:], 8) & 0xff)
+ code = LITERAL, int(this[-6:], 8) & CHARMASK
a(code)
else:
try:
a(ESCAPES[this])
except KeyError:
for c in this:
- a((LITERAL, c))
+ a((LITERAL, ord(c)))
else:
- a((LITERAL, this))
+ a((LITERAL, ord(this)))
return p
def expand_template(template, match):
@@ -623,12 +614,17 @@ def expand_template(template, match):
# code instead
p = []
a = p.append
+ sep = match.string[:0]
+ if type(sep) is type(""):
+ char = chr
+ else:
+ char = unichr
for c, s in template:
if c is LITERAL:
- a(s)
+ a(char(s))
elif c is MARK:
s = match.group(s)
if s is None:
raise error, "empty group"
a(s)
- return match.string[:0].join(p)
+ return sep.join(p)
diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre
index d3732b5..10de93d 100644
--- a/Lib/test/output/test_sre
+++ b/Lib/test/output/test_sre
@@ -1,6 +1,5 @@
test_sre
-test_support -- test failed re module pickle
-test_support -- test failed re module cPickle
+=== Failed incorrectly ('\\x00ffffffffffffff', '\377', 0, 'found', '\377')
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
=== grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/'
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 22b6c73..268c5dd 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -20,7 +20,7 @@
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* 00-06-30 fl tuning, fast search (0.9.3)
- * 00-06-30 fl added assert (lookahead) primitives (0.9.4)
+ * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -339,7 +339,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
}
LOCAL(int)
-SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
+SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
{
/* check if character is a member of the given set */
@@ -356,13 +356,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
return !ok;
case SRE_OP_LITERAL:
- if (ch == (SRE_CHAR) set[0])
+ if (ch == set[0])
return ok;
set++;
break;
case SRE_OP_RANGE:
- if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
+ if (set[0] <= ch && ch <= set[1])
return ok;
set += 2;
break;
@@ -455,8 +455,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL:
/* match literal string */
/* args: <code> */
- TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
- if (ptr >= end || *ptr != (SRE_CHAR) pattern[0])
+ TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
+ if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
goto failure;
pattern++;
ptr++;
@@ -465,8 +465,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */
/* args: <code> */
- TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
- if (ptr >= end || *ptr == (SRE_CHAR) pattern[0])
+ TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
+ if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
goto failure;
pattern++;
ptr++;
@@ -528,7 +528,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
break;
case SRE_OP_LITERAL_IGNORE:
- TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
+ TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
if (ptr >= end ||
state->lower(*ptr) != state->lower(*pattern))
goto failure;
@@ -537,8 +537,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
break;
case SRE_OP_NOT_LITERAL_IGNORE:
- TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
- (SRE_CHAR) *pattern));
+ TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
if (ptr >= end ||
state->lower(*ptr) == state->lower(*pattern))
goto failure;
@@ -549,7 +548,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_IN_IGNORE:
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end
- || !SRE_MEMBER(pattern+1, (SRE_CHAR) state->lower(*ptr)))
+ || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
goto failure;
pattern += pattern[0];
ptr++;
@@ -631,9 +630,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} else if (pattern[3] == SRE_OP_LITERAL) {
/* repeated literal */
- SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
- if (ptr >= end || *ptr != chr)
+ if (ptr >= end || (SRE_CODE) ptr[0] != chr)
break;
ptr++;
count++;
@@ -641,9 +640,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
/* repeated literal */
- SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
- if (ptr >= end || (SRE_CHAR) state->lower(*ptr) != chr)
+ if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
break;
ptr++;
count++;
@@ -651,9 +650,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} else if (pattern[3] == SRE_OP_NOT_LITERAL) {
/* repeated non-literal */
- SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
- if (ptr >= end || *ptr == chr)
+ if (ptr >= end || (SRE_CODE) ptr[0] == chr)
break;
ptr++;
count++;
@@ -661,9 +660,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
/* repeated non-literal */
- SRE_CHAR chr = (SRE_CHAR) pattern[4];
+ SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
- if (ptr >= end || (SRE_CHAR) state->lower(*ptr) == chr)
+ if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
break;
ptr++;
count++;
@@ -712,7 +711,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
- SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
+ SRE_CODE chr = pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) {
TRACE(("%8d: scan for tail match\n", PTR(ptr)));
@@ -868,7 +867,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: branch\n", PTR(ptr)));
while (*pattern) {
if (pattern[1] != SRE_OP_LITERAL ||
- (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
+ (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
TRACE(("%8d: branch check\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
@@ -976,7 +975,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
end = state->end;
while (ptr < end) {
for (;;) {
- if (*ptr != (SRE_CHAR) prefix[i]) {
+ if ((SRE_CODE) ptr[0] != prefix[i]) {
if (!i)
break;
else
@@ -1008,9 +1007,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (pattern[0] == SRE_OP_LITERAL) {
/* pattern starts with a literal character. this is used for
short prefixes, and if fast search is disabled*/
- SRE_CHAR chr = (SRE_CHAR) pattern[1];
+ SRE_CODE chr = pattern[1];
for (;;) {
- while (ptr < end && *ptr != chr)
+ while (ptr < end && (SRE_CODE) ptr[0] != chr)
ptr++;
if (ptr == end)
return 0;