gh-91524: Speed up the regular expression substitution (#91525)

The functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524. Primarily authored by serhiy-storchaka (Serhiy Storchaka). Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
author: Serhiy Storchaka <storchaka@gmail.com> 2022-10-23 22:57:30 (GMT)
committer: GitHub <noreply@github.com> 2022-10-23 22:57:30 (GMT)
commit: 75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch)
tree: cbf2004f870017f90874dc721e928da9412bab50 /Lib/re/_parser.py
parent: 176b6c57be70fb70fd0563813a87822545eb4bbf (diff)
download: cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2
1 file changed, 16 insertions, 29 deletions
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 0d9cf63..5709acb 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
 
     return p
 
-def parse_template(source, state):
+def parse_template(source, pattern):
     # parse 're' replacement string into list of literals and
     # group references
     s = Tokenizer(source)
     sget = s.get
-    groups = []
-    literals = []
+    result = []
     literal = []
     lappend = literal.append
+    def addliteral():
+        if s.istext:
+            result.append(''.join(literal))
+        else:
+            # The tokenizer implicitly decodes bytes objects as latin-1, we must
+            # therefore re-encode the final representation.
+            result.append(''.join(literal).encode('latin-1'))
+        del literal[:]
     def addgroup(index, pos):
-        if index > state.groups:
+        if index > pattern.groups:
             raise s.error("invalid group reference %d" % index, pos)
-        if literal:
-            literals.append(''.join(literal))
-            del literal[:]
-        groups.append((len(literals), index))
-        literals.append(None)
-    groupindex = state.groupindex
+        addliteral()
+        result.append(index)
+    groupindex = pattern.groupindex
     while True:
         this = sget()
         if this is None:
@@ -1063,22 +1067,5 @@ def parse_template(source, state):
                 lappend(this)
         else:
             lappend(this)
-    if literal:
-        literals.append(''.join(literal))
-    if not isinstance(source, str):
-        # The tokenizer implicitly decodes bytes objects as latin-1, we must
-        # therefore re-encode the final representation.
-        literals = [None if s is None else s.encode('latin-1') for s in literals]
-    return groups, literals
-
-def expand_template(template, match):
-    g = match.group
-    empty = match.string[:0]
-    groups, literals = template
-    literals = literals[:]
-    try:
-        for index, group in groups:
-            literals[index] = g(group) or empty
-    except IndexError:
-        raise error("invalid group reference %d" % index) from None
-    return empty.join(literals)
+    addliteral()
+    return result
author	Serhiy Storchaka <storchaka@gmail.com>	2022-10-23 22:57:30 (GMT)
committer	GitHub <noreply@github.com>	2022-10-23 22:57:30 (GMT)
commit	75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch)
tree	cbf2004f870017f90874dc721e928da9412bab50 /Lib/re/_parser.py
parent	176b6c57be70fb70fd0563813a87822545eb4bbf (diff)
download	cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2