-rw-r--r--  Lib/re.py            39
-rw-r--r--  Lib/test/test_re.py  63
-rw-r--r--  Misc/NEWS             7
3 files changed, 106 insertions, 3 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 9bd913a..2f1a76e 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -254,7 +254,40 @@ _cache_repl = {}
 _pattern_type = type(sre_compile.compile("", 0))
-_MAXCACHE = 100
+_MAXCACHE = 500
+
+def _shrink_cache(cache_dict, max_length, divisor=5):
+    """Make room in the given cache.
+
+    Args:
+        cache_dict: The cache dictionary to modify.
+        max_length: Maximum # of entries in cache_dict before it is shrunk.
+        divisor: Cache will shrink to max_length - max_length/divisor items.
+    """
+    # Toss out a fraction of the entries at random to make room for new ones.
+    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
+    # as popitem could penalize the same regular expression repeatedly based
+    # on its internal hash value. Being random should spread the cache miss
+    # love around.
+    cache_keys = tuple(cache_dict.keys())
+    overage = len(cache_keys) - max_length
+    if overage < 0:
+        # Cache is already within limits. Normally this should not happen
+        # but it could due to multithreading.
+        return
+    number_to_toss = max_length // divisor + overage
+    # The import is done here to avoid a circular dependency.
+    import random
+    if not hasattr(random, 'sample'):
+        # Do nothing while resolving the circular dependency:
+        #   re->random->warnings->tokenize->string->re
+        return
+    for doomed_key in random.sample(cache_keys, number_to_toss):
+        try:
+            del cache_dict[doomed_key]
+        except KeyError:
+            # Ignore problems if the cache changed from another thread.
+            pass
 def _compile(*key):
     # internal: compile pattern
@@ -272,7 +305,7 @@ def _compile(*key):
         raise TypeError("first argument must be string or compiled pattern")
     p = sre_compile.compile(pattern, flags)
     if len(_cache) >= _MAXCACHE:
-        _cache.clear()
+        _shrink_cache(_cache, _MAXCACHE)
     _cache[cachekey] = p
     return p
@@ -284,7 +317,7 @@ def _compile_repl(*key):
     repl, pattern = key
     p = sre_parse.parse_template(repl, pattern)
     if len(_cache_repl) >= _MAXCACHE:
-        _cache_repl.clear()
+        _shrink_cache(_cache_repl, _MAXCACHE)
     _cache_repl[key] = p
     return p
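
For readers who want to see the eviction policy in isolation, here is a
minimal standalone sketch of the same algorithm applied to a plain dict; it
mirrors the patch's _shrink_cache but omits the circular-import guard, and
the cache key format is only illustrative:

import random

def shrink_cache(cache_dict, max_length, divisor=5):
    """Sketch of the patch's eviction policy (guard code omitted)."""
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        return  # already within limits
    # Evict max_length/divisor entries plus any overage, chosen at random.
    number_to_toss = max_length // divisor + overage
    for doomed_key in random.sample(cache_keys, number_to_toss):
        cache_dict.pop(doomed_key, None)  # tolerate concurrent deletion

# With the defaults (max_length=500, divisor=5), a full cache drops
# 500 // 5 == 100 entries, i.e. 20%, leaving room for new patterns.
cache = {('pattern%d' % i, 0): object() for i in range(500)}
shrink_cache(cache, 500)
print(len(cache))  # 400
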
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 7b0a8dd..6b11685 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -874,8 +874,71 @@ def run_re_tests():
             if result is None:
                 print('=== Fails on unicode-sensitive match', t)
+
+class ReCacheTests(unittest.TestCase):
+    """These tests are specific to the re._shrink_cache implementation."""
+
+    def setUp(self):
+        self._orig_maxcache = re._MAXCACHE
+
+    def tearDown(self):
+        re._MAXCACHE = self._orig_maxcache
+
+    def test_compile_cache_overflow(self):
+        # NOTE: If a profiler or debugger is tracing code and compiling
+        # regular expressions while tracing through this test... expect
+        # the test to fail. This test is not concurrency safe.
+
+        # Explicitly fill the caches.
+        re._MAXCACHE = 20
+        max_cache = re._MAXCACHE
+        unique_chars = tuple(chr(char_num) for char_num in
+                             range(b'a'[0], b'a'[0] + max_cache))
+        re._cache.clear()
+        for char in unique_chars:
+            re._compile(char, 0)
+        self.assertEqual(max_cache, len(re._cache))
+        re._cache_repl.clear()
+        for char in unique_chars:
+            re._compile_repl(char*2, char)
+        self.assertEqual(max_cache, len(re._cache_repl))
+
+        # Overflow both caches and make sure they have extra room left
+        # afterwards as well as having more than a single entry.
+        re._compile('A', 0)
+        self.assertLess(len(re._cache), max_cache)
+        self.assertGreater(len(re._cache), 1)
+        re._compile_repl('A', 'A')
+        self.assertLess(len(re._cache_repl), max_cache)
+        self.assertGreater(len(re._cache_repl), 1)
+
+    def test_shrink_cache_at_limit(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 6, divisor=3)
+        self.assertEqual(4, len(cache))
+
+    def test_shrink_cache_empty(self):
+        cache = {}
+        re._shrink_cache(cache, 6, divisor=3)
+        # Cache was empty; make sure we didn't raise an exception.
+        self.assertEqual(0, len(cache))
+
+    def test_shrink_cache_overflowing(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 4, divisor=2)
+        # Cache was larger than the maximum; make sure it shrank below it.
+        self.assertEqual(2, len(cache))
+
+    def test_shrink_cache_underflow(self):
+        cache = dict(zip(range(6), range(6)))
+        # No shrinking to do.
+        re._shrink_cache(cache, 9, divisor=3)
+        self.assertEqual(6, len(cache))
+
+
 def test_main():
     run_unittest(ReTests)
+    run_unittest(ReCacheTests)
     run_re_tests()
 
 if __name__ == "__main__":
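
As a quick sanity check outside the unittest harness, the new behavior can be
observed directly once this patch is applied. Note that re._cache,
re._MAXCACHE and re._compile are private implementation details, so this
sketch assumes this exact version of the module:

import re

re._cache.clear()
# Compile one more unique pattern than the cache can hold.
for i in range(re._MAXCACHE + 1):
    re._compile('pattern%d' % i, 0)
# The old policy would leave a single entry here; with random eviction
# roughly 80% of the entries survive the overflow.
assert 1 < len(re._cache) < re._MAXCACHE
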
diff --git a/Misc/NEWS b/Misc/NEWS
index 70a646f..a6a7c20 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -473,6 +473,13 @@ C-API
 Library
 -------
+- The default size of the re module's compiled regular expression cache has
+  been increased from 100 to 500, and the cache replacement policy has changed
+  from clearing the entire cache on overflow to randomly forgetting 20% of the
+  cached compiled regular expressions. This is a performance win for
+  applications that use many regular expressions, and it limits the
+  performance hit whenever the cache overflows.
+
 - Issue #7113: Speed up loading in configparser. Patch by Ɓukasz Langa.
 
 - Issue #9032: XML-RPC client retries the request on EPIPE error. The EPIPE
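
As a footnote to the NEWS entry above: the 20% figure follows from
_shrink_cache's default divisor of 5. A worked check of the arithmetic:

# For a cache sitting exactly at its limit, overage is zero, so the number
# of randomly evicted entries is simply max_length // divisor.
max_length, divisor = 500, 5
overage = max(0, 500 - max_length)                  # 0, cache is at the limit
number_to_toss = max_length // divisor + overage
print(number_to_toss, number_to_toss / max_length)  # 100 0.2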