gh-91524: Speed up the regular expression substitution (#91525)

Functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524. Primarily authored by serhiy-storchaka (Serhiy Storchaka). Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
author: Serhiy Storchaka <storchaka@gmail.com> 2022-10-23 22:57:30 (GMT)
committer: GitHub <noreply@github.com> 2022-10-23 22:57:30 (GMT)
commit: 75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch)
tree: cbf2004f870017f90874dc721e928da9412bab50 /Lib/re
parent: 176b6c57be70fb70fd0563813a87822545eb4bbf (diff)
download: cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2
3 files changed, 21 insertions, 48 deletions
diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py
index 8d6a4ef..4515650 100644
--- a/Lib/re/__init__.py
+++ b/Lib/re/__init__.py
@@ -124,6 +124,7 @@ This module also defines an exception 'error'.
 import enum
 from . import _compiler, _parser
 import functools
+import _sre
 
 
 # public symbols
@@ -230,7 +231,7 @@ def purge():
     "Clear the regular expression caches"
     _cache.clear()
     _cache2.clear()
-    _compile_repl.cache_clear()
+    _compile_template.cache_clear()
 
 def template(pattern, flags=0):
     "Compile a template pattern, returning a Pattern object, deprecated"
@@ -328,24 +329,9 @@ def _compile(pattern, flags):
     return p
 
 @functools.lru_cache(_MAXCACHE)
-def _compile_repl(repl, pattern):
+def _compile_template(pattern, repl):
     # internal: compile replacement pattern
-    return _parser.parse_template(repl, pattern)
-
-def _expand(pattern, match, template):
-    # internal: Match.expand implementation hook
-    template = _parser.parse_template(template, pattern)
-    return _parser.expand_template(template, match)
-
-def _subx(pattern, template):
-    # internal: Pattern.sub/subn implementation helper
-    template = _compile_repl(template, pattern)
-    if not template[0] and len(template[1]) == 1:
-        # literal replacement
-        return template[1][0]
-    def filter(match, template=template):
-        return _parser.expand_template(template, match)
-    return filter
+    return _sre.template(pattern, _parser.parse_template(repl, pattern))
 
 # register myself for pickling
 
diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py
index 10ee14b..d8718d3 100644
--- a/Lib/re/_constants.py
+++ b/Lib/re/_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20220615
+MAGIC = 20221023
 
 from _sre import MAXREPEAT, MAXGROUPS
 
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 0d9cf63..5709acb 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
 
     return p
 
-def parse_template(source, state):
+def parse_template(source, pattern):
     # parse 're' replacement string into list of literals and
     # group references
     s = Tokenizer(source)
     sget = s.get
-    groups = []
-    literals = []
+    result = []
     literal = []
     lappend = literal.append
+    def addliteral():
+        if s.istext:
+            result.append(''.join(literal))
+        else:
+            # The tokenizer implicitly decodes bytes objects as latin-1, we must
+            # therefore re-encode the final representation.
+            result.append(''.join(literal).encode('latin-1'))
+        del literal[:]
     def addgroup(index, pos):
-        if index > state.groups:
+        if index > pattern.groups:
             raise s.error("invalid group reference %d" % index, pos)
-        if literal:
-            literals.append(''.join(literal))
-            del literal[:]
-        groups.append((len(literals), index))
-        literals.append(None)
-    groupindex = state.groupindex
+        addliteral()
+        result.append(index)
+    groupindex = pattern.groupindex
     while True:
         this = sget()
         if this is None:
@@ -1063,22 +1067,5 @@ def parse_template(source, state):
                 lappend(this)
         else:
             lappend(this)
-    if literal:
-        literals.append(''.join(literal))
-    if not isinstance(source, str):
-        # The tokenizer implicitly decodes bytes objects as latin-1, we must
-        # therefore re-encode the final representation.
-        literals = [None if s is None else s.encode('latin-1') for s in literals]
-    return groups, literals
-
-def expand_template(template, match):
-    g = match.group
-    empty = match.string[:0]
-    groups, literals = template
-    literals = literals[:]
-    try:
-        for index, group in groups:
-            literals[index] = g(group) or empty
-    except IndexError:
-        raise error("invalid group reference %d" % index) from None
-    return empty.join(literals)
+    addliteral()
+    return result
author	Serhiy Storchaka <storchaka@gmail.com>	2022-10-23 22:57:30 (GMT)
committer	GitHub <noreply@github.com>	2022-10-23 22:57:30 (GMT)
commit	75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch)
tree	cbf2004f870017f90874dc721e928da9412bab50 /Lib/re
parent	176b6c57be70fb70fd0563813a87822545eb4bbf (diff)
download	cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2