summaryrefslogtreecommitdiffstats
path: root/Lib/sre_parse.py
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2013-10-23 19:27:52 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2013-10-23 19:27:52 (GMT)
commit9c15ec1ce6a8353b50819baa843c144572807401 (patch)
treeaca9ff89020c0e00f9a22c3549bf2f81db6bd012 /Lib/sre_parse.py
parent4d397008cdbcc653ff69fc9a39d29e2aaae07512 (diff)
downloadcpython-9c15ec1ce6a8353b50819baa843c144572807401.zip
cpython-9c15ec1ce6a8353b50819baa843c144572807401.tar.gz
cpython-9c15ec1ce6a8353b50819baa843c144572807401.tar.bz2
Issue #19365: Optimized the parsing of long replacement string in re.sub*()
functions.
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r--Lib/sre_parse.py74
1 files changed, 30 insertions, 44 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index f26229f..6583ef6 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -769,35 +769,33 @@ def parse_template(source, pattern):
# group references
s = Tokenizer(source)
sget = s.get
- p = []
- a = p.append
- def literal(literal, p=p, pappend=a):
- if p and p[-1][0] is LITERAL:
- p[-1] = LITERAL, p[-1][1] + literal
- else:
- pappend((LITERAL, literal))
- sep = source[:0]
- if isinstance(sep, str):
- makechar = chr
- else:
- makechar = chr
- while 1:
+ groups = []
+ literals = []
+ literal = []
+ lappend = literal.append
+ def addgroup(index):
+ if literal:
+ literals.append(''.join(literal))
+ del literal[:]
+ groups.append((len(literals), index))
+ literals.append(None)
+ while True:
this = sget()
if this is None:
break # end of replacement string
- if this and this[0] == "\\":
+ if this[0] == "\\":
# group
- c = this[1:2]
+ c = this[1]
if c == "g":
name = ""
if s.match("<"):
- while 1:
+ while True:
char = sget()
if char is None:
raise error("unterminated group name")
if char == ">":
break
- name = name + char
+ name += char
if not name:
raise error("missing group name")
try:
@@ -811,50 +809,38 @@ def parse_template(source, pattern):
index = pattern.groupindex[name]
except KeyError:
raise IndexError("unknown group name")
- a((MARK, index))
+ addgroup(index)
elif c == "0":
if s.next in OCTDIGITS:
- this = this + sget()
+ this += sget()
if s.next in OCTDIGITS:
- this = this + sget()
- literal(makechar(int(this[1:], 8) & 0xff))
+ this += sget()
+ lappend(chr(int(this[1:], 8) & 0xff))
elif c in DIGITS:
isoctal = False
if s.next in DIGITS:
- this = this + sget()
+ this += sget()
if (c in OCTDIGITS and this[2] in OCTDIGITS and
s.next in OCTDIGITS):
- this = this + sget()
+ this += sget()
isoctal = True
- literal(makechar(int(this[1:], 8) & 0xff))
+ lappend(chr(int(this[1:], 8) & 0xff))
if not isoctal:
- a((MARK, int(this[1:])))
+ addgroup(int(this[1:]))
else:
try:
- this = makechar(ESCAPES[this][1])
+ this = chr(ESCAPES[this][1])
except KeyError:
pass
- literal(this)
+ lappend(this)
else:
- literal(this)
- # convert template to groups and literals lists
- i = 0
- groups = []
- groupsappend = groups.append
- literals = [None] * len(p)
- if isinstance(source, str):
- encode = lambda x: x
- else:
+ lappend(this)
+ if literal:
+ literals.append(''.join(literal))
+ if not isinstance(source, str):
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
- encode = lambda x: x.encode('latin-1')
- for c, s in p:
- if c is MARK:
- groupsappend((i, s))
- # literal[i] is already None
- else:
- literals[i] = encode(s)
- i = i + 1
+ literals = [None if s is None else s.encode('latin-1') for s in literals]
return groups, literals
def expand_template(template, match):