diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-10-23 22:57:30 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-23 22:57:30 (GMT) |
commit | 75a6fadf369315b27e12f670e6295cf2c2cf7d7e (patch) | |
tree | cbf2004f870017f90874dc721e928da9412bab50 /Lib/re/_parser.py | |
parent | 176b6c57be70fb70fd0563813a87822545eb4bbf (diff) | |
download | cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.zip cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.gz cpython-75a6fadf369315b27e12f670e6295cf2c2cf7d7e.tar.bz2 |
gh-91524: Speed up the regular expression substitution (#91525)
The functions re.sub() and re.subn(), and the corresponding re.Pattern methods,
are now 2-3 times faster for replacement strings containing group references.
Closes #91524
Primarily authored by Serhiy Storchaka (serhiy-storchaka)
Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
Diffstat (limited to 'Lib/re/_parser.py')
-rw-r--r-- | Lib/re/_parser.py | 45 |
1 file changed, 16 insertions, 29 deletions
```diff
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 0d9cf63..5709acb 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
 
     return p
 
-def parse_template(source, state):
+def parse_template(source, pattern):
     # parse 're' replacement string into list of literals and
     # group references
     s = Tokenizer(source)
     sget = s.get
-    groups = []
-    literals = []
+    result = []
     literal = []
     lappend = literal.append
+    def addliteral():
+        if s.istext:
+            result.append(''.join(literal))
+        else:
+            # The tokenizer implicitly decodes bytes objects as latin-1, we must
+            # therefore re-encode the final representation.
+            result.append(''.join(literal).encode('latin-1'))
+        del literal[:]
     def addgroup(index, pos):
-        if index > state.groups:
+        if index > pattern.groups:
             raise s.error("invalid group reference %d" % index, pos)
-        if literal:
-            literals.append(''.join(literal))
-            del literal[:]
-        groups.append((len(literals), index))
-        literals.append(None)
-    groupindex = state.groupindex
+        addliteral()
+        result.append(index)
+    groupindex = pattern.groupindex
     while True:
         this = sget()
         if this is None:
@@ -1063,22 +1067,5 @@ def parse_template(source, state):
                 lappend(this)
         else:
             lappend(this)
-    if literal:
-        literals.append(''.join(literal))
-    if not isinstance(source, str):
-        # The tokenizer implicitly decodes bytes objects as latin-1, we must
-        # therefore re-encode the final representation.
-        literals = [None if s is None else s.encode('latin-1') for s in literals]
-    return groups, literals
-
-def expand_template(template, match):
-    g = match.group
-    empty = match.string[:0]
-    groups, literals = template
-    literals = literals[:]
-    try:
-        for index, group in groups:
-            literals[index] = g(group) or empty
-    except IndexError:
-        raise error("invalid group reference %d" % index) from None
-    return empty.join(literals)
+    addliteral()
+    return result
```