summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Fredrik Lundh <fredrik@pythonware.com> 2001-03-22 15:50:10 (GMT)
committer Fredrik Lundh <fredrik@pythonware.com> 2001-03-22 15:50:10 (GMT)
commit b25e1ad253a4d96aea31a7a3fb78522ea354f43a (patch)
tree 2cc9dc18021270ffc2d7982ecca15b6942f59413
parent 8e9972c215ea0b10f0a7516d1cded6f26296ceba (diff)
download cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.zip
cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.tar.gz
cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.tar.bz2
sre 2.1b2 update:
- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn
-rw-r--r-- Lib/sre.py 22
-rw-r--r-- Lib/sre_compile.py 16
-rw-r--r-- Lib/sre_constants.py 20
-rw-r--r-- Lib/sre_parse.py 63
-rwxr-xr-x Lib/test/re_tests.py 11
-rw-r--r-- Lib/test/test_sre.py 2
-rw-r--r-- Modules/_sre.c 74
-rw-r--r-- Modules/sre_constants.h 6
8 files changed, 165 insertions, 49 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index 48d390a..6706fac 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -23,6 +23,8 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall",
"U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
"UNICODE", "error" ]
+__version__ = "2.1b2"
+
# this module works under 1.5.2 and later. don't use string methods
import string
@@ -90,6 +92,7 @@ def compile(pattern, flags=0):
def purge():
"Clear the regular expression cache"
_cache.clear()
+ _cache_repl.clear()
def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object"
@@ -111,6 +114,8 @@ def escape(pattern):
# internals
_cache = {}
+_cache_repl = {}
+
_MAXCACHE = 100
def _join(seq, sep):
@@ -134,6 +139,21 @@ def _compile(*key):
_cache[key] = p
return p
+def _compile_repl(*key):
+ # internal: compile replacement pattern
+ p = _cache_repl.get(key)
+ if p is not None:
+ return p
+ repl, pattern = key
+ try:
+ p = sre_parse.parse_template(repl, pattern)
+ except error, v:
+ raise error, v # invalid expression
+ if len(_cache_repl) >= _MAXCACHE:
+ _cache_repl.clear()
+ _cache_repl[key] = p
+ return p
+
def _expand(pattern, match, template):
# internal: match.expand implementation hook
template = sre_parse.parse_template(template, pattern)
@@ -148,7 +168,7 @@ def _subn(pattern, template, string, count=0):
if callable(template):
filter = template
else:
- template = sre_parse.parse_template(template, pattern)
+ template = _compile_repl(template, pattern)
def filter(match, template=template):
return sre_parse.expand_template(template, match)
n = i = 0
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index ab2a2cc..44cb23e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -105,9 +105,12 @@ def _compile(code, pattern, flags):
elif op is AT:
emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
- emit(ATCODES[AT_MULTILINE.get(av, av)])
- else:
- emit(ATCODES[av])
+ av = AT_MULTILINE.get(av, av)
+ if flags & SRE_FLAG_LOCALE:
+ av = AT_LOCALE.get(av, av)
+ elif flags & SRE_FLAG_UNICODE:
+ av = AT_UNICODE.get(av, av)
+ emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
@@ -124,11 +127,10 @@ def _compile(code, pattern, flags):
elif op is CATEGORY:
emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
- emit(CHCODES[CH_LOCALE[av]])
+ av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
- emit(CHCODES[CH_UNICODE[av]])
- else:
- emit(CHCODES[av])
+ av = CH_UNICODE[av]
+ emit(CHCODES[av])
elif op is GROUPREF:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index b429a33..bbe7880 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -11,7 +11,7 @@
# update when constants are added or removed
-MAGIC = 20010115
+MAGIC = 20010320
# max code word in this release
@@ -67,6 +67,10 @@ AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
AT_END_LINE = "at_end_line"
AT_END_STRING = "at_end_string"
+AT_LOC_BOUNDARY = "at_loc_boundary"
+AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
+AT_UNI_BOUNDARY = "at_uni_boundary"
+AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
# categories
CATEGORY_DIGIT = "category_digit"
@@ -119,7 +123,9 @@ OPCODES = [
ATCODES = [
AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
- AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING
+ AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
+ AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
+ AT_UNI_NON_BOUNDARY
]
CHCODES = [
@@ -157,6 +163,16 @@ AT_MULTILINE = {
AT_END: AT_END_LINE
}
+AT_LOCALE = {
+ AT_BOUNDARY: AT_LOC_BOUNDARY,
+ AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
+}
+
+AT_UNICODE = {
+ AT_BOUNDARY: AT_UNI_BOUNDARY,
+ AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
+}
+
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 3840365..44626bd 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -638,6 +638,16 @@ def parse_template(source, pattern):
s = Tokenizer(source)
p = []
a = p.append
+ def literal(literal, p=p):
+ if p and p[-1][0] is LITERAL:
+ p[-1] = LITERAL, p[-1][1] + literal
+ else:
+ p.append((LITERAL, literal))
+ sep = source[:0]
+ if type(sep) is type(""):
+ char = chr
+ else:
+ char = unichr
while 1:
this = s.get()
if this is None:
@@ -681,33 +691,42 @@ def parse_template(source, pattern):
break
if not code:
this = this[1:]
- code = LITERAL, atoi(this[-6:], 8) & 0xff
- a(code)
+ code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
+ if code[0] is LITERAL:
+ literal(code[1])
+ else:
+ a(code)
else:
try:
- a(ESCAPES[this])
+ this = char(ESCAPES[this][1])
except KeyError:
- for c in this:
- a((LITERAL, ord(c)))
+ pass
+ literal(this)
else:
- a((LITERAL, ord(this)))
- return p
+ literal(this)
+ # convert template to groups and literals lists
+ i = 0
+ groups = []
+ literals = []
+ for c, s in p:
+ if c is MARK:
+ groups.append((i, s))
+ literals.append(None)
+ else:
+ literals.append(s)
+ i = i + 1
+ return groups, literals
def expand_template(template, match):
- # XXX: <fl> this is sooooo slow. drop in the slicelist code instead
- p = []
- a = p.append
+ g = match.group
sep = match.string[:0]
- if type(sep) is type(""):
- char = chr
- else:
- char = unichr
- for c, s in template:
- if c is LITERAL:
- a(char(s))
- elif c is MARK:
- s = match.group(s)
+ groups, literals = template
+ literals = literals[:]
+ try:
+ for index, group in groups:
+ literals[index] = s = g(group)
if s is None:
- raise error, "empty group"
- a(s)
- return string.join(p, sep)
+ raise IndexError
+ except IndexError:
+ raise error, "empty group"
+ return string.join(literals, sep)
diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
index aacd916..7c5dc89 100755
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py
@@ -639,3 +639,14 @@ xyzabc
# bug 130748: ^* should be an error (nothing to repeat)
(r'^*', '', SYNTAX_ERROR),
]
+
+try:
+ u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'")
+except SyntaxError:
+ pass
+else:
+ tests.extend([
+ # bug 410271: \b broken under locales
+ (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
+ (r'(?u)\b.\b', u, SUCCEED, 'found', u),
+ ])
diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py
index 88c0d62..031cda6 100644
--- a/Lib/test/test_sre.py
+++ b/Lib/test/test_sre.py
@@ -329,6 +329,8 @@ for t in tests:
u = unicode(s, "latin-1")
except NameError:
pass
+ except TypeError:
+ continue # skip unicode test strings
else:
result=obj.search(u)
if result==None:
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 63e4ef3..8811038 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -24,8 +24,9 @@
* 2000-10-24 fl really fixed assert_not; reset groups in findall
* 2000-12-21 fl fixed memory leak in groupdict
* 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
- * 2001-01-15 fl avoid recursion for MIN_UTIL; fixed uppercase literal bug
+ * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
* 2001-01-16 fl fixed memory leak in pattern destructor
+ * 2001-03-20 fl lots of fixes for 2.1b2
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
@@ -40,7 +41,7 @@
#ifndef SRE_RECURSIVE
-char copyright[] = " SRE 2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
+char copyright[] = " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
#include "Python.h"
@@ -141,11 +142,6 @@ static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
-static unsigned int sre_lower(unsigned int ch)
-{
- return ((ch) < 128 ? sre_char_lower[ch] : ch);
-}
-
#define SRE_IS_DIGIT(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
@@ -157,30 +153,39 @@ static unsigned int sre_lower(unsigned int ch)
#define SRE_IS_WORD(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
-/* locale-specific character predicates */
-
-static unsigned int sre_lower_locale(unsigned int ch)
+static unsigned int sre_lower(unsigned int ch)
{
- return ((ch) < 256 ? tolower((ch)) : ch);
+ return ((ch) < 128 ? sre_char_lower[ch] : ch);
}
+
+/* locale-specific character predicates */
+
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
+static unsigned int sre_lower_locale(unsigned int ch)
+{
+ return ((ch) < 256 ? tolower((ch)) : ch);
+}
+
/* unicode-specific character predicates */
#if defined(HAVE_UNICODE)
-static unsigned int sre_lower_unicode(unsigned int ch)
-{
- return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
-}
+
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
+
+static unsigned int sre_lower_unicode(unsigned int ch)
+{
+ return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
+}
+
#endif
LOCAL(int)
@@ -418,6 +423,42 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0;
return this == that;
+
+ case SRE_AT_LOC_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+ return this != that;
+
+ case SRE_AT_LOC_NON_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+ return this == that;
+
+ case SRE_AT_UNI_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+ return this != that;
+
+ case SRE_AT_UNI_NON_BOUNDARY:
+ if (state->beginning == state->end)
+ return 0;
+ that = ((void*) ptr > state->beginning) ?
+ SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+ this = ((void*) ptr < state->end) ?
+ SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+ return this == that;
}
return 0;
@@ -1037,7 +1078,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
/* see if the tail matches */
state->repeat = rp->prev;
- if (rp->pattern[2] == 65535) {
+ /* FIXME: the following fix doesn't always work (#133283) */
+ if (0 && rp->pattern[2] == 65535) {
/* unbounded repeat */
for (;;) {
i = SRE_MATCH(state, pattern, level + 1);
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index c6850ad..73bcb34 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20010115
+#define SRE_MAGIC 20010320
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -49,6 +49,10 @@
#define SRE_AT_END 5
#define SRE_AT_END_LINE 6
#define SRE_AT_END_STRING 7
+#define SRE_AT_LOC_BOUNDARY 8
+#define SRE_AT_LOC_NON_BOUNDARY 9
+#define SRE_AT_UNI_BOUNDARY 10
+#define SRE_AT_UNI_NON_BOUNDARY 11
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2