summaryrefslogtreecommitdiffstats
path: root/Lib/re.py
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>1998-07-17 20:18:49 (GMT)
committerGuido van Rossum <guido@python.org>1998-07-17 20:18:49 (GMT)
commit0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac (patch)
treea95286375214c45e7b752a7d4fbfba902a79167f /Lib/re.py
parentc364cf82283332522ebc139e2edb5c8247f61307 (diff)
downloadcpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.zip
cpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.tar.gz
cpython-0e5ab17ad34c8810f0626409fd3d1c8dc85e68ac.tar.bz2
Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by
not calling self.search(); instead, call self.code.match() directly and interpret the list of registers it returns directly. This saves the overhead of instantiating a MatchObject for each hit, basically inlining search() as well as group(). When a MatchObject is still needed, one is allocated and reused for the duration of the scan.
Diffstat (limited to 'Lib/re.py')
-rw-r--r--Lib/re.py109
1 files changed, 71 insertions, 38 deletions
diff --git a/Lib/re.py b/Lib/re.py
index c5b71b8..7d5a044 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -138,42 +138,56 @@ class RegexObject:
non-overlapping occurrences of the pattern in the source
string by the replacement repl. number is the number of
substitutions that were made."""
-
+
if count < 0:
raise error, "negative substitution count"
if count == 0:
- import sys
count = sys.maxint
- if type(repl) == type(''):
- if '\\' in repl:
- repl = lambda m, r=repl: pcre_expand(m, r)
- else:
- repl = lambda m, r=repl: r
n = 0 # Number of matches
pos = 0 # Where to start searching
lastmatch = -1 # End of last match
results = [] # Substrings making up the result
end = len(source)
+
+ if type(repl) is type(''):
+ # See if repl contains group references
+ try:
+ repl = pcre_expand(_Dummy, repl)
+ except:
+ m = MatchObject(self, source, 0, end, [])
+ repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
+ else:
+ m = None
+ else:
+ m = MatchObject(self, source, 0, end, [])
+
+ match = self.code.match
+ append = results.append
while n < count and pos <= end:
- m = self.search(source, pos)
- if not m:
+ regs = match(source, pos, end, 0)
+ if not regs:
break
- i, j = m.span(0)
+ i, j = regs[0]
if i == j == lastmatch:
# Empty match adjacent to previous match
pos = pos + 1
- results.append(source[lastmatch:pos])
+ append(source[lastmatch:pos])
continue
if pos < i:
- results.append(source[pos:i])
- results.append(repl(m))
+ append(source[pos:i])
+ if m:
+ m.pos = pos
+ m.regs = regs
+ append(repl(m))
+ else:
+ append(repl)
pos = lastmatch = j
if i == j:
# Last match was empty; don't try here again
pos = pos + 1
- results.append(source[lastmatch:pos])
+ append(source[lastmatch:pos])
n = n + 1
- results.append(source[pos:])
+ append(source[pos:])
return (string.join(results, ''), n)
def split(self, source, maxsplit=0):
@@ -183,34 +197,40 @@ class RegexObject:
if maxsplit < 0:
raise error, "negative split count"
if maxsplit == 0:
- import sys
maxsplit = sys.maxint
n = 0
pos = 0
lastmatch = 0
results = []
end = len(source)
+ match = self.code.match
+ append = results.append
while n < maxsplit:
- m = self.search(source, pos)
- if not m:
+ regs = match(source, pos, end, 0)
+ if not regs:
break
- i, j = m.span(0)
+ i, j = regs[0]
if i == j:
# Empty match
if pos >= end:
break
pos = pos+1
continue
- results.append(source[lastmatch:i])
- g = m.groups()
- if g:
- results[len(results):] = list(g)
+ append(source[lastmatch:i])
+ rest = regs[1:]
+ if rest:
+ for a, b in rest:
+ if a == -1 or b == -1:
+ group = None
+ else:
+ group = source[a:b]
+ append(group)
pos = lastmatch = j
n = n + 1
- results.append(source[lastmatch:])
+ append(source[lastmatch:])
return results
- def findall(self, string):
+ def findall(self, source):
"""Return a list of all non-overlapping matches in the string.
If one or more groups are present in the pattern, return a
@@ -221,20 +241,29 @@ class RegexObject:
"""
pos = 0
- n = len(string)
- result = []
- while pos <= n:
- m = self.search(string, pos)
- if not m:
+ end = len(source)
+ results = []
+ match = self.code.match
+ append = results.append
+ while pos <= end:
+ regs = match(source, pos, end, 0)
+ if not regs:
break
- gr = m.groups()
- if not gr:
- gr = m.group()
- elif len(gr) == 1:
- gr = gr[0]
- result.append(gr)
- pos = max(m.end(), pos+1)
- return result
+ i, j = regs[0]
+ rest = regs[1:]
+ if not rest:
+ gr = source[i:j]
+ elif len(rest) == 1:
+ a, b = rest[0]
+ gr = source[a:b]
+ else:
+ gr = []
+ for (a, b) in rest:
+ gr.append(source[a:b])
+ gr = tuple(gr)
+ append(gr)
+ pos = max(j, pos+1)
+ return results
# The following 3 functions were contributed by Mike Fletcher, and
# allow pickling and unpickling of RegexObject instances.
@@ -251,6 +280,10 @@ class RegexObject:
self.groupindex = statetuple[2]
self.code = apply(pcre_compile, statetuple)
+class _Dummy:
+ # Dummy class used by _subn_string(). Has 'group' to avoid core dump.
+ group = None
+
class MatchObject:
def __init__(self, re, string, pos, endpos, regs):