diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2001-03-22 15:50:10 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2001-03-22 15:50:10 (GMT) |
commit | b25e1ad253a4d96aea31a7a3fb78522ea354f43a (patch) | |
tree | 2cc9dc18021270ffc2d7982ecca15b6942f59413 /Lib | |
parent | 8e9972c215ea0b10f0a7516d1cded6f26296ceba (diff) | |
download | cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.zip cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.tar.gz cpython-b25e1ad253a4d96aea31a7a3fb78522ea354f43a.tar.bz2 |
sre 2.1b2 update:
- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn
Diffstat (limited to 'Lib')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | Lib/sre.py | 22 |
| -rw-r--r-- | Lib/sre_compile.py | 16 |
| -rw-r--r-- | Lib/sre_constants.py | 20 |
| -rw-r--r-- | Lib/sre_parse.py | 63 |
| -rwxr-xr-x | Lib/test/re_tests.py | 11 |
| -rw-r--r-- | Lib/test/test_sre.py | 2 |
6 files changed, 102 insertions, 32 deletions
@@ -23,6 +23,8 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE", "error" ] +__version__ = "2.1b2" + # this module works under 1.5.2 and later. don't use string methods import string @@ -90,6 +92,7 @@ def compile(pattern, flags=0): def purge(): "Clear the regular expression cache" _cache.clear() + _cache_repl.clear() def template(pattern, flags=0): "Compile a template pattern, returning a pattern object" @@ -111,6 +114,8 @@ def escape(pattern): # internals _cache = {} +_cache_repl = {} + _MAXCACHE = 100 def _join(seq, sep): @@ -134,6 +139,21 @@ def _compile(*key): _cache[key] = p return p +def _compile_repl(*key): + # internal: compile replacement pattern + p = _cache_repl.get(key) + if p is not None: + return p + repl, pattern = key + try: + p = sre_parse.parse_template(repl, pattern) + except error, v: + raise error, v # invalid expression + if len(_cache_repl) >= _MAXCACHE: + _cache_repl.clear() + _cache_repl[key] = p + return p + def _expand(pattern, match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) @@ -148,7 +168,7 @@ def _subn(pattern, template, string, count=0): if callable(template): filter = template else: - template = sre_parse.parse_template(template, pattern) + template = _compile_repl(template, pattern) def filter(match, template=template): return sre_parse.expand_template(template, match) n = i = 0 diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index ab2a2cc..44cb23e 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -105,9 +105,12 @@ def _compile(code, pattern, flags): elif op is AT: emit(OPCODES[op]) if flags & SRE_FLAG_MULTILINE: - emit(ATCODES[AT_MULTILINE.get(av, av)]) - else: - emit(ATCODES[av]) + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(ATCODES[av]) elif op 
is BRANCH: emit(OPCODES[op]) tail = [] @@ -124,11 +127,10 @@ def _compile(code, pattern, flags): elif op is CATEGORY: emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + av = CH_LOCALE[av] elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) + av = CH_UNICODE[av] + emit(CHCODES[av]) elif op is GROUPREF: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index b429a33..bbe7880 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -11,7 +11,7 @@ # update when constants are added or removed -MAGIC = 20010115 +MAGIC = 20010320 # max code word in this release @@ -67,6 +67,10 @@ AT_NON_BOUNDARY = "at_non_boundary" AT_END = "at_end" AT_END_LINE = "at_end_line" AT_END_STRING = "at_end_string" +AT_LOC_BOUNDARY = "at_loc_boundary" +AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" +AT_UNI_BOUNDARY = "at_uni_boundary" +AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" # categories CATEGORY_DIGIT = "category_digit" @@ -119,7 +123,9 @@ OPCODES = [ ATCODES = [ AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING + AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, + AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, + AT_UNI_NON_BOUNDARY ] CHCODES = [ @@ -157,6 +163,16 @@ AT_MULTILINE = { AT_END: AT_END_LINE } +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + CH_LOCALE = { CATEGORY_DIGIT: CATEGORY_DIGIT, CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 3840365..44626bd 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -638,6 +638,16 @@ def parse_template(source, pattern): s = Tokenizer(source) p = [] a = p.append + def literal(literal, p=p): + if p and p[-1][0] is LITERAL: + p[-1] = LITERAL, 
p[-1][1] + literal + else: + p.append((LITERAL, literal)) + sep = source[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr while 1: this = s.get() if this is None: @@ -681,33 +691,42 @@ def parse_template(source, pattern): break if not code: this = this[1:] - code = LITERAL, atoi(this[-6:], 8) & 0xff - a(code) + code = LITERAL, char(atoi(this[-6:], 8) & 0xff) + if code[0] is LITERAL: + literal(code[1]) + else: + a(code) else: try: - a(ESCAPES[this]) + this = char(ESCAPES[this][1]) except KeyError: - for c in this: - a((LITERAL, ord(c))) + pass + literal(this) else: - a((LITERAL, ord(this))) - return p + literal(this) + # convert template to groups and literals lists + i = 0 + groups = [] + literals = [] + for c, s in p: + if c is MARK: + groups.append((i, s)) + literals.append(None) + else: + literals.append(s) + i = i + 1 + return groups, literals def expand_template(template, match): - # XXX: <fl> this is sooooo slow. drop in the slicelist code instead - p = [] - a = p.append + g = match.group sep = match.string[:0] - if type(sep) is type(""): - char = chr - else: - char = unichr - for c, s in template: - if c is LITERAL: - a(char(s)) - elif c is MARK: - s = match.group(s) + groups, literals = template + literals = literals[:] + try: + for index, group in groups: + literals[index] = s = g(group) if s is None: - raise error, "empty group" - a(s) - return string.join(p, sep) + raise IndexError + except IndexError: + raise error, "empty group" + return string.join(literals, sep) diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index aacd916..7c5dc89 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -639,3 +639,14 @@ xyzabc # bug 130748: ^* should be an error (nothing to repeat) (r'^*', '', SYNTAX_ERROR), ] + +try: + u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'") +except SyntaxError: + pass +else: + tests.extend([ + # bug 410271: \b broken under locales + (r'\b.\b', 'a', SUCCEED, 'found', 'a'), + (r'(?u)\b.\b', u, 
SUCCEED, 'found', u), + ]) diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py index 88c0d62..031cda6 100644 --- a/Lib/test/test_sre.py +++ b/Lib/test/test_sre.py @@ -329,6 +329,8 @@ for t in tests: u = unicode(s, "latin-1") except NameError: pass + except TypeError: + continue # skip unicode test strings else: result=obj.search(u) if result==None: |