diff options
author | Guido van Rossum <guido@python.org> | 1998-07-17 20:18:49 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-07-17 20:18:49 (GMT) |
commit | 0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac (patch) | |
tree | a95286375214c45e7b752a7d4fbfba902a79167f /Lib | |
parent | c364cf82283332522ebc139e2edb5c8247f61307 (diff) | |
download | cpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.zip cpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.tar.gz cpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.tar.bz2 |
Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by
not calling self.search(); instead, call self.code.match() directly
and interpret the list of registers it returns directly. This saves
the overhead of instantiating a MatchObject for each hit, basically
inlining search() as well as group(). When a MatchObject is still
needed, one is allocated and reused for the duration of the scan.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/re.py | 109 |
1 files changed, 71 insertions, 38 deletions
@@ -138,42 +138,56 @@ class RegexObject: non-overlapping occurrences of the pattern in the source string by the replacement repl. number is the number of substitutions that were made.""" - + if count < 0: raise error, "negative substitution count" if count == 0: - import sys count = sys.maxint - if type(repl) == type(''): - if '\\' in repl: - repl = lambda m, r=repl: pcre_expand(m, r) - else: - repl = lambda m, r=repl: r n = 0 # Number of matches pos = 0 # Where to start searching lastmatch = -1 # End of last match results = [] # Substrings making up the result end = len(source) + + if type(repl) is type(''): + # See if repl contains group references + try: + repl = pcre_expand(_Dummy, repl) + except: + m = MatchObject(self, source, 0, end, []) + repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl) + else: + m = None + else: + m = MatchObject(self, source, 0, end, []) + + match = self.code.match + append = results.append while n < count and pos <= end: - m = self.search(source, pos) - if not m: + regs = match(source, pos, end, 0) + if not regs: break - i, j = m.span(0) + i, j = regs[0] if i == j == lastmatch: # Empty match adjacent to previous match pos = pos + 1 - results.append(source[lastmatch:pos]) + append(source[lastmatch:pos]) continue if pos < i: - results.append(source[pos:i]) - results.append(repl(m)) + append(source[pos:i]) + if m: + m.pos = pos + m.regs = regs + append(repl(m)) + else: + append(repl) pos = lastmatch = j if i == j: # Last match was empty; don't try here again pos = pos + 1 - results.append(source[lastmatch:pos]) + append(source[lastmatch:pos]) n = n + 1 - results.append(source[pos:]) + append(source[pos:]) return (string.join(results, ''), n) def split(self, source, maxsplit=0): @@ -183,34 +197,40 @@ class RegexObject: if maxsplit < 0: raise error, "negative split count" if maxsplit == 0: - import sys maxsplit = sys.maxint n = 0 pos = 0 lastmatch = 0 results = [] end = len(source) + match = self.code.match + append = results.append while n < maxsplit: - m = self.search(source, pos) - if not m: + regs = match(source, pos, end, 0) + if not regs: break - i, j = m.span(0) + i, j = regs[0] if i == j: # Empty match if pos >= end: break pos = pos+1 continue - results.append(source[lastmatch:i]) - g = m.groups() - if g: - results[len(results):] = list(g) + append(source[lastmatch:i]) + rest = regs[1:] + if rest: + for a, b in rest: + if a == -1 or b == -1: + group = None + else: + group = source[a:b] + append(group) pos = lastmatch = j n = n + 1 - results.append(source[lastmatch:]) + append(source[lastmatch:]) return results - def findall(self, string): + def findall(self, source): """Return a list of all non-overlapping matches in the string. If one or more groups are present in the pattern, return a @@ -221,20 +241,29 @@ class RegexObject: """ pos = 0 - n = len(string) - result = [] - while pos <= n: - m = self.search(string, pos) - if not m: + end = len(source) + results = [] + match = self.code.match + append = results.append + while pos <= end: + regs = match(source, pos, end, 0) + if not regs: break - gr = m.groups() - if not gr: - gr = m.group() - elif len(gr) == 1: - gr = gr[0] - result.append(gr) - pos = max(m.end(), pos+1) - return result + i, j = regs[0] + rest = regs[1:] + if not rest: + gr = source[i:j] + elif len(rest) == 1: + a, b = rest[0] + gr = source[a:b] + else: + gr = [] + for (a, b) in rest: + gr.append(source[a:b]) + gr = tuple(gr) + append(gr) + pos = max(j, pos+1) + return results # The following 3 functions were contributed by Mike Fletcher, and # allow pickling and unpickling of RegexObject instances. @@ -251,6 +280,10 @@ class RegexObject: self.groupindex = statetuple[2] self.code = apply(pcre_compile, statetuple) +class _Dummy: + # Dummy class used by _subn_string(). Has 'group' to avoid core dump. + group = None + class MatchObject: def __init__(self, re, string, pos, endpos, regs): |