diff options
author | Ashwin Ramaswami <aramaswamis@gmail.com> | 2019-09-03 16:42:53 (GMT) |
---|---|---|
committer | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2019-09-03 16:42:53 (GMT) |
commit | ea21389dda401457198fb214aa2c981a45ed9528 (patch) | |
tree | fa4232b8646ea424f1411e18fe22f64061bef9c5 /Lib/email/_header_value_parser.py | |
parent | 48058050cee5f6600150392100c162f223b4317f (diff) | |
download | cpython-ea21389dda401457198fb214aa2c981a45ed9528.zip cpython-ea21389dda401457198fb214aa2c981a45ed9528.tar.gz cpython-ea21389dda401457198fb214aa2c981a45ed9528.tar.bz2 |
[3.7] bpo-37764: Fix infinite loop when parsing unstructured email headers. (GH-15239) (GH-15654)
bpo-37764: Fix infinite loop when parsing unstructured email headers. (GH-15239)
Fixes a case in which email._header_value_parser.get_unstructured hangs (loops forever) on some invalid headers. This covers the cases in which the header contains either:
- an encoded word without trailing whitespace
- an invalid encoded word
https://bugs.python.org/issue37764
This fix should also be backported to 3.7 and 3.8
https://bugs.python.org/issue37764
(cherry picked from commit c5b242f87f31286ad38991bc3868cf4cfbf2b681)
Co-authored-by: Ashwin Ramaswami <aramaswamis@gmail.com>
https://bugs.python.org/issue37764
Diffstat (limited to 'Lib/email/_header_value_parser.py')
-rw-r--r-- | Lib/email/_header_value_parser.py | 19 |
1 files changed, 16 insertions, 3 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 2306879..7cc9a46 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -933,6 +933,10 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal): return '' +class _InvalidEwError(errors.HeaderParseError): + """Invalid encoded word found while parsing headers.""" + + # XXX these need to become classes and used as instances so # that a program can't change them in a parse tree and screw # up other parse trees. Maybe should have tests for that, too. @@ -1037,7 +1041,10 @@ def get_encoded_word(value): raise errors.HeaderParseError( "expected encoded word but found {}".format(value)) remstr = ''.join(remainder) - if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits: + if (len(remstr) > 1 and + remstr[0] in hexdigits and + remstr[1] in hexdigits and + tok.count('?') < 2): # The ? after the CTE was followed by an encoded word escape (=XX). rest, *remainder = remstr.split('?=', 1) tok = tok + '?=' + rest @@ -1049,7 +1056,7 @@ def get_encoded_word(value): try: text, charset, lang, defects = _ew.decode('=?' + tok + '?=') except ValueError: - raise errors.HeaderParseError( + raise _InvalidEwError( "encoded word format invalid: '{}'".format(ew.cte)) ew.charset = charset ew.lang = lang @@ -1099,9 +1106,12 @@ def get_unstructured(value): token, value = get_fws(value) unstructured.append(token) continue + valid_ew = True if value.startswith('=?'): try: token, value = get_encoded_word(value) + except _InvalidEwError: + valid_ew = False except errors.HeaderParseError: # XXX: Need to figure out how to register defects when # appropriate here. @@ -1123,7 +1133,10 @@ def get_unstructured(value): # Split in the middle of an atom if there is a rfc2047 encoded word # which does not have WSP on both sides. The defect will be registered # the next time through the loop. 
- if rfc2047_matcher.search(tok): + # This needs to only be performed when the encoded word is valid; + # otherwise, performing it on an invalid encoded word can cause + # the parser to go in an infinite loop. + if valid_ew and rfc2047_matcher.search(tok): tok, *remainder = value.partition('=?') vtext = ValueTerminal(tok, 'vtext') _validate_xtext(vtext) |