summary refs log tree commit diff stats
diff options
context:
space:
mode:
author CF Bolz-Tereick <cfbolz@gmx.de> 2023-07-13 06:12:56 (GMT)
committer GitHub <noreply@github.com> 2023-07-13 06:12:56 (GMT)
commit 7e6ce48872fa3de98c986057764f35e1b2f4b936 (patch)
tree 340b37ca60b5d12c2a25be0786a3c894c3c03c9d
parent af51bd7cda9c0cba149b882c1e501765595e5fc3 (diff)
download cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.zip
cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.tar.gz
cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.tar.bz2
gh-106628: email parsing speedup (gh-106629)
-rw-r--r-- Lib/email/feedparser.py 15
-rw-r--r-- Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst 2
2 files changed, 11 insertions, 6 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 885097c..53d71f5 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -37,6 +37,8 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
+boundaryendRE = re.compile(
+ r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
NeedMoreData = object()
@@ -327,9 +329,10 @@ class FeedParser:
# this onto the input stream until we've scanned past the
# preamble.
separator = '--' + boundary
- boundaryre = re.compile(
- '(?P<sep>' + re.escape(separator) +
- r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+ def boundarymatch(line):
+ if not line.startswith(separator):
+ return None
+ return boundaryendRE.match(line, len(separator))
capturing_preamble = True
preamble = []
linesep = False
@@ -341,7 +344,7 @@ class FeedParser:
continue
if line == '':
break
- mo = boundaryre.match(line)
+ mo = boundarymatch(line)
if mo:
# If we're looking at the end boundary, we're done with
# this multipart. If there was a newline at the end of
@@ -373,13 +376,13 @@ class FeedParser:
if line is NeedMoreData:
yield NeedMoreData
continue
- mo = boundaryre.match(line)
+ mo = boundarymatch(line)
if not mo:
self._input.unreadline(line)
break
# Recurse to parse this subpart; the input stream points
# at the subpart's first line.
- self._input.push_eof_matcher(boundaryre.match)
+ self._input.push_eof_matcher(boundarymatch)
for retval in self._parsegen():
if retval is NeedMoreData:
yield NeedMoreData
diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
new file mode 100644
index 0000000..6fa276e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.