summary | refs | log | tree | commit | diff | stats
path: root/Lib/re
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2022-10-23 22:57:30 (GMT)
committer: GitHub <noreply@github.com> 2022-10-23 22:57:30 (GMT)
commit: 75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch)
tree: cbf2004f870017f90874dc721e928da9412bab50 /Lib/re
parent: 176b6c57be70fb70fd0563813a87822545eb4bbf (diff)
download: cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz
cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2
gh-91524: Speed up the regular expression substitution (#91525)
The functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references.
Closes #91524.
Primarily authored by Serhiy Storchaka (serhiy-storchaka).
Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
Diffstat (limited to 'Lib/re')
-rw-r--r-- Lib/re/__init__.py 22
-rw-r--r-- Lib/re/_constants.py 2
-rw-r--r-- Lib/re/_parser.py 45
3 files changed, 21 insertions, 48 deletions
diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py
index 8d6a4ef..4515650 100644
--- a/Lib/re/__init__.py
+++ b/Lib/re/__init__.py
@@ -124,6 +124,7 @@ This module also defines an exception 'error'.
import enum
from . import _compiler, _parser
import functools
+import _sre
# public symbols
@@ -230,7 +231,7 @@ def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
- _compile_repl.cache_clear()
+ _compile_template.cache_clear()
def template(pattern, flags=0):
"Compile a template pattern, returning a Pattern object, deprecated"
@@ -328,24 +329,9 @@ def _compile(pattern, flags):
return p
@functools.lru_cache(_MAXCACHE)
-def _compile_repl(repl, pattern):
+def _compile_template(pattern, repl):
# internal: compile replacement pattern
- return _parser.parse_template(repl, pattern)
-
-def _expand(pattern, match, template):
- # internal: Match.expand implementation hook
- template = _parser.parse_template(template, pattern)
- return _parser.expand_template(template, match)
-
-def _subx(pattern, template):
- # internal: Pattern.sub/subn implementation helper
- template = _compile_repl(template, pattern)
- if not template[0] and len(template[1]) == 1:
- # literal replacement
- return template[1][0]
- def filter(match, template=template):
- return _parser.expand_template(template, match)
- return filter
+ return _sre.template(pattern, _parser.parse_template(repl, pattern))
# register myself for pickling
diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py
index 10ee14b..d8718d3 100644
--- a/Lib/re/_constants.py
+++ b/Lib/re/_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20220615
+MAGIC = 20221023
from _sre import MAXREPEAT, MAXGROUPS
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 0d9cf63..5709acb 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
return p
-def parse_template(source, state):
+def parse_template(source, pattern):
# parse 're' replacement string into list of literals and
# group references
s = Tokenizer(source)
sget = s.get
- groups = []
- literals = []
+ result = []
literal = []
lappend = literal.append
+ def addliteral():
+ if s.istext:
+ result.append(''.join(literal))
+ else:
+ # The tokenizer implicitly decodes bytes objects as latin-1, we must
+ # therefore re-encode the final representation.
+ result.append(''.join(literal).encode('latin-1'))
+ del literal[:]
def addgroup(index, pos):
- if index > state.groups:
+ if index > pattern.groups:
raise s.error("invalid group reference %d" % index, pos)
- if literal:
- literals.append(''.join(literal))
- del literal[:]
- groups.append((len(literals), index))
- literals.append(None)
- groupindex = state.groupindex
+ addliteral()
+ result.append(index)
+ groupindex = pattern.groupindex
while True:
this = sget()
if this is None:
@@ -1063,22 +1067,5 @@ def parse_template(source, state):
lappend(this)
else:
lappend(this)
- if literal:
- literals.append(''.join(literal))
- if not isinstance(source, str):
- # The tokenizer implicitly decodes bytes objects as latin-1, we must
- # therefore re-encode the final representation.
- literals = [None if s is None else s.encode('latin-1') for s in literals]
- return groups, literals
-
-def expand_template(template, match):
- g = match.group
- empty = match.string[:0]
- groups, literals = template
- literals = literals[:]
- try:
- for index, group in groups:
- literals[index] = g(group) or empty
- except IndexError:
- raise error("invalid group reference %d" % index) from None
- return empty.join(literals)
+ addliteral()
+ return result