diff options
author | CF Bolz-Tereick <cfbolz@gmx.de> | 2023-07-13 06:12:56 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-13 06:12:56 (GMT) |
commit | 7e6ce48872fa3de98c986057764f35e1b2f4b936 (patch) | |
tree | 340b37ca60b5d12c2a25be0786a3c894c3c03c9d | |
parent | af51bd7cda9c0cba149b882c1e501765595e5fc3 (diff) | |
download | cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.zip cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.tar.gz cpython-7e6ce48872fa3de98c986057764f35e1b2f4b936.tar.bz2 |
gh-106628: email parsing speedup (gh-106629)
-rw-r--r-- | Lib/email/feedparser.py | 15 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst | 2 |
2 files changed, 11 insertions, 6 deletions
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 885097c..53d71f5 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -37,6 +37,8 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
 headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
 EMPTYSTRING = ''
 NL = '\n'
+boundaryendRE = re.compile(
+    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
 
 NeedMoreData = object()
 
@@ -327,9 +329,10 @@ class FeedParser:
                 # this onto the input stream until we've scanned past the
                 # preamble.
                 separator = '--' + boundary
-                boundaryre = re.compile(
-                    '(?P<sep>' + re.escape(separator) +
-                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+                def boundarymatch(line):
+                    if not line.startswith(separator):
+                        return None
+                    return boundaryendRE.match(line, len(separator))
                 capturing_preamble = True
                 preamble = []
                 linesep = False
@@ -341,7 +344,7 @@ class FeedParser:
                         continue
                     if line == '':
                         break
-                    mo = boundaryre.match(line)
+                    mo = boundarymatch(line)
                     if mo:
                         # If we're looking at the end boundary, we're done with
                         # this multipart. If there was a newline at the end of
@@ -373,13 +376,13 @@ class FeedParser:
                             if line is NeedMoreData:
                                 yield NeedMoreData
                                 continue
-                            mo = boundaryre.match(line)
+                            mo = boundarymatch(line)
                             if not mo:
                                 self._input.unreadline(line)
                                 break
                         # Recurse to parse this subpart; the input stream points
                         # at the subpart's first line.
-                        self._input.push_eof_matcher(boundaryre.match)
+                        self._input.push_eof_matcher(boundarymatch)
                         for retval in self._parsegen():
                             if retval is NeedMoreData:
                                 yield NeedMoreData
diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
new file mode 100644
index 0000000..6fa276e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.