[3.8] bpo-21315: Fix parsing of encoded words with missing leading ws (GH-13425) (GH-15655)

* [bpo-21315](https://bugs.python.org/issue21315): Fix parsing of encoded words with missing leading whitespace. Because of the missing leading whitespace, an encoded word would get parsed as an unstructured token. This patch fixes that by looking for encoded words when splitting tokens on whitespace. Missing trailing whitespace after an encoded word now registers a defect instead. Original patch suggestion by David R. Murray on [bpo-21315](https://bugs.python.org/issue21315). (cherry picked from commit 66c4f3f38b867d8329b28c032bb907fd1a2f22d2) Co-authored-by: Abhilash Raj <maxking@users.noreply.github.com> (cherry picked from commit dc20fc4311dece19488299a7cd11317ffbe4d3c3) Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> https://bugs.python.org/issue21315
author: Ashwin Ramaswami <aramaswamis@gmail.com> 2019-09-03 17:08:39 (GMT)
committer: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2019-09-03 17:08:39 (GMT)
commit: 59e8fba7189d0e86d428a1125744afb8b0f40b5d (patch)
tree: dd7c247e36f0684255552d7b06a7ca955b041394 /Lib/email
parent: 58067d2cf6e81187f9782aff03cc8bec3d878778 (diff)
download: cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.zip
cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.tar.gz
cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.tar.bz2
1 files changed, 21 insertions, 0 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index ea7083f..b500394 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
 
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
 #
 # TokenList and its subclasses
 #
@@ -1054,6 +1066,10 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
     return ew, value
 
 def get_unstructured(value):
@@ -1106,6 +1122,11 @@ def get_unstructured(value):
                 unstructured.append(token)
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
author	Ashwin Ramaswami <aramaswamis@gmail.com>	2019-09-03 17:08:39 (GMT)
committer	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>	2019-09-03 17:08:39 (GMT)
commit	59e8fba7189d0e86d428a1125744afb8b0f40b5d (patch)
tree	dd7c247e36f0684255552d7b06a7ca955b041394 /Lib/email
parent	58067d2cf6e81187f9782aff03cc8bec3d878778 (diff)
download	cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.zip cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.tar.gz cpython-59e8fba7189d0e86d428a1125744afb8b0f40b5d.tar.bz2