bpo-21315: Fix parsing of encoded words with missing leading ws. (#13425)

* bpo-21315: Fix parsing of encoded words with missing leading ws. Because of missing leading whitespace, encoded word would get parsed as unstructured token. This patch fixes that by looking for encoded words when splitting tokens with whitespace. Missing trailing whitespace around encoded word now register a defect instead. Original patch suggestion by David R. Murray on bpo-21315.
author: Abhilash Raj <maxking@users.noreply.github.com> 2019-06-05 16:56:33 (GMT)
committer: Barry Warsaw <barry@python.org> 2019-06-05 16:56:33 (GMT)
commit: 66c4f3f38b867d8329b28c032bb907fd1a2f22d2 (patch)
tree: bed2fe319cf50ffbc6d2ac2c89d5f167df4e9d48
parent: 142566c028720934325f0b7fe28680afd046e00f (diff)
download: cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.zip
cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.gz
cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.bz2
4 files changed, 49 insertions, 3 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 34969ab..35d746a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
 
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
 #
 # TokenList and its subclasses
 #
@@ -1052,6 +1064,10 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
     return ew, value
 
 def get_unstructured(value):
@@ -1104,6 +1120,11 @@ def get_unstructured(value):
                 unstructured.append(token)
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 12da3cf..649923f 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
                          '=?us-ascii?q?first?==?utf-8?q?second?=',
                          'first',
                          'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                          '=?utf-8?q?second?=')
 
     def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
             '=?utf-8?q?foo?==?utf-8?q?bar?=',
             'foobar',
             'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
             [errors.InvalidHeaderDefect],
             '')
 
@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase):
             '"=?utf-8?Q?not_really_valid?="',
             '"not really valid"',
             'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
             '')
 
     # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index 7550546..5d9b357 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
 
         'rfc2047_atom_in_quoted_string_is_decoded':
             ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
             'Éric <foo@example.com>',
             'Éric',
             'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
new file mode 100644
index 0000000..dd0dd7f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC2047 encoded words are parsed despite the missing
+whitespace, and a defect registered. Also missing trailing whitespace after
+encoded words is now registered as a defect.
+
author	Abhilash Raj <maxking@users.noreply.github.com>	2019-06-05 16:56:33 (GMT)
committer	Barry Warsaw <barry@python.org>	2019-06-05 16:56:33 (GMT)
commit	66c4f3f38b867d8329b28c032bb907fd1a2f22d2 (patch)
tree	bed2fe319cf50ffbc6d2ac2c89d5f167df4e9d48
parent	142566c028720934325f0b7fe28680afd046e00f (diff)
download	cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.zip cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.gz cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.bz2