From 7e6ce48872fa3de98c986057764f35e1b2f4b936 Mon Sep 17 00:00:00 2001 From: CF Bolz-Tereick Date: Thu, 13 Jul 2023 08:12:56 +0200 Subject: gh-106628: email parsing speedup (gh-106629) --- Lib/email/feedparser.py | 15 +++++++++------ .../2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 885097c..53d71f5 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -37,6 +37,8 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)') headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])') EMPTYSTRING = '' NL = '\n' +boundaryendRE = re.compile( + r'(?P--)?(?P[ \t]*)(?P\r\n|\r|\n)?$') NeedMoreData = object() @@ -327,9 +329,10 @@ class FeedParser: # this onto the input stream until we've scanned past the # preamble. separator = '--' + boundary - boundaryre = re.compile( - '(?P' + re.escape(separator) + - r')(?P--)?(?P[ \t]*)(?P\r\n|\r|\n)?$') + def boundarymatch(line): + if not line.startswith(separator): + return None + return boundaryendRE.match(line, len(separator)) capturing_preamble = True preamble = [] linesep = False @@ -341,7 +344,7 @@ class FeedParser: continue if line == '': break - mo = boundaryre.match(line) + mo = boundarymatch(line) if mo: # If we're looking at the end boundary, we're done with # this multipart. If there was a newline at the end of @@ -373,13 +376,13 @@ class FeedParser: if line is NeedMoreData: yield NeedMoreData continue - mo = boundaryre.match(line) + mo = boundarymatch(line) if not mo: self._input.unreadline(line) break # Recurse to parse this subpart; the input stream points # at the subpart's first line. - self._input.push_eof_matcher(boundaryre.match) + self._input.push_eof_matcher(boundarymatch) for retval in self._parsegen(): if retval is NeedMoreData: yield NeedMoreData diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst new file mode 100644 index 0000000..6fa276e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst @@ -0,0 +1,2 @@ +Speed up parsing of emails by about 20% by not compiling a new regular +expression for every single email. -- cgit v0.12