summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2010-08-09 05:56:50 (GMT)
committerRaymond Hettinger <python@rcn.com>2010-08-09 05:56:50 (GMT)
commit31022301b59bb84a85be13e8da02b898ae07c7fc (patch)
tree36c163d37e7f525ab673d7f7238e8248d4b8851a
parent4f859ed9c7feba6e403895c4044608d29efd2ba9 (diff)
downloadcpython-31022301b59bb84a85be13e8da02b898ae07c7fc.zip
cpython-31022301b59bb84a85be13e8da02b898ae07c7fc.tar.gz
cpython-31022301b59bb84a85be13e8da02b898ae07c7fc.tar.bz2
Revert 83784 adding functools.lru_cache() to the re module.
The problem is that the re module is imported by sysconfig and re needs functools which uses collections.OrderedDict() but the _collectionsmodule.c code is not yet constructed at this point in the build. The likely best solution will be to include _collections as part of the static build before the rest of the boot-strapping.
-rw-r--r--Lib/re.py32
-rw-r--r--Lib/test/test_re.py62
-rw-r--r--Misc/NEWS4
3 files changed, 84 insertions, 14 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 269eaef..2f1a76e 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -118,7 +118,6 @@ This module also defines an exception 'error'.
import sys
import sre_compile
import sre_parse
-import functools
# public symbols
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
@@ -206,9 +205,9 @@ def compile(pattern, flags=0):
return _compile(pattern, flags)
def purge():
- "Clear the regular expression caches"
- _compile_typed.clear()
- _compile_repl.clear()
+ "Clear the regular expression cache"
+ _cache.clear()
+ _cache_repl.clear()
def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object"
@@ -290,12 +289,12 @@ def _shrink_cache(cache_dict, max_length, divisor=5):
# Ignore problems if the cache changed from another thread.
pass
-def _compile(*args):
- return _compile_typed(type(args[0]), *args)
-
-@functools.lru_cache(maxsize=_MAXCACHE)
-def _compile_typed(type, *key):
+def _compile(*key):
# internal: compile pattern
+ cachekey = (type(key[0]),) + key
+ p = _cache.get(cachekey)
+ if p is not None:
+ return p
pattern, flags = key
if isinstance(pattern, _pattern_type):
if flags:
@@ -304,14 +303,23 @@ def _compile_typed(type, *key):
return pattern
if not sre_compile.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern")
- return sre_compile.compile(pattern, flags)
+ p = sre_compile.compile(pattern, flags)
+ if len(_cache) >= _MAXCACHE:
+ _shrink_cache(_cache, _MAXCACHE)
+ _cache[cachekey] = p
return p
-@functools.lru_cache(maxsize=_MAXCACHE)
def _compile_repl(*key):
# internal: compile replacement pattern
+ p = _cache_repl.get(key)
+ if p is not None:
+ return p
repl, pattern = key
- return sre_parse.parse_template(repl, pattern)
+ p = sre_parse.parse_template(repl, pattern)
+ if len(_cache_repl) >= _MAXCACHE:
+ _shrink_cache(_cache_repl, _MAXCACHE)
+ _cache_repl[key] = p
+ return p
def _expand(pattern, match, template):
# internal: match.expand implementation hook
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 96a83b8..6b11685 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -875,8 +875,70 @@ def run_re_tests():
print('=== Fails on unicode-sensitive match', t)
+class ReCacheTests(unittest.TestCase):
+ """These tests are specific to the re._shrink_cache implementation."""
+
+ def setUp(self):
+ self._orig_maxcache = re._MAXCACHE
+
+ def tearDown(self):
+ re._MAXCACHE = self._orig_maxcache
+
+ def test_compile_cache_overflow(self):
+ # NOTE: If a profiler or debugger is tracing code and compiling
+ # regular expressions while tracing through this test... expect
+ # the test to fail. This test is not concurrency safe.
+
+ # Explicitly fill the caches.
+ re._MAXCACHE = 20
+ max_cache = re._MAXCACHE
+ unique_chars = tuple(chr(char_num) for char_num in
+ range(b'a'[0], b'a'[0]+max_cache))
+ re._cache.clear()
+ for char in unique_chars:
+ re._compile(char, 0)
+ self.assertEqual(max_cache, len(re._cache))
+ re._cache_repl.clear()
+ for char in unique_chars:
+ re._compile_repl(char*2, char)
+ self.assertEqual(max_cache, len(re._cache_repl))
+
+ # Overflow both caches and make sure they have extra room left
+ # afterwards as well as having more than a single entry.
+ re._compile('A', 0)
+ self.assertLess(len(re._cache), max_cache)
+ self.assertGreater(len(re._cache), 1)
+ re._compile_repl('A', 'A')
+ self.assertLess(len(re._cache_repl), max_cache)
+ self.assertGreater(len(re._cache_repl), 1)
+
+ def test_shrink_cache_at_limit(self):
+ cache = dict(zip(range(6), range(6)))
+ re._shrink_cache(cache, 6, divisor=3)
+ self.assertEqual(4, len(cache))
+
+ def test_shrink_cache_empty(self):
+ cache = {}
+ re._shrink_cache(cache, 6, divisor=3)
+ # Cache was empty, make sure we didn't raise an exception.
+ self.assertEqual(0, len(cache))
+
+ def test_shrink_cache_overflowing(self):
+ cache = dict(zip(range(6), range(6)))
+ re._shrink_cache(cache, 4, divisor=2)
+ # Cache was larger than the maximum, be sure we shrunk to smaller.
+ self.assertEqual(2, len(cache))
+
+ def test_shrink_cache_underflow(self):
+ cache = dict(zip(range(6), range(6)))
+ # No shrinking to do.
+ re._shrink_cache(cache, 9, divisor=3)
+ self.assertEqual(6, len(cache))
+
+
def test_main():
run_unittest(ReTests)
+ run_unittest(ReCacheTests)
run_re_tests()
if __name__ == "__main__":
diff --git a/Misc/NEWS b/Misc/NEWS
index 57bcf8e..00188d4 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -697,8 +697,8 @@ Library
- The default size of the re module's compiled regular expression cache has been
increased from 100 to 500 and the cache replacement policy has changed from
- simply clearing the entire cache on overflow to forgetting the least recently
- used cached compiled regular expressions. This is a performance win for
+ simply clearing the entire cache on overflow to randomly forgetting 20% of the
+ existing cached compiled regular expressions. This is a performance win for
applications that use a lot of regular expressions and limits the impact of
the performance hit anytime the cache is exceeded.