summaryrefslogtreecommitdiffstats
path: root/Lib/email
diff options
context:
space:
mode:
authorAbhilash Raj <maxking@users.noreply.github.com>2019-06-05 16:56:33 (GMT)
committerBarry Warsaw <barry@python.org>2019-06-05 16:56:33 (GMT)
commit66c4f3f38b867d8329b28c032bb907fd1a2f22d2 (patch)
treebed2fe319cf50ffbc6d2ac2c89d5f167df4e9d48 /Lib/email
parent142566c028720934325f0b7fe28680afd046e00f (diff)
downloadcpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.zip
cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.gz
cpython-66c4f3f38b867d8329b28c032bb907fd1a2f22d2.tar.bz2
bpo-21315: Fix parsing of encoded words with missing leading ws. (#13425)
* bpo-21315: Fix parsing of encoded words with missing leading ws. Because of missing leading whitespace, encoded word would get parsed as unstructured token. This patch fixes that by looking for encoded words when splitting tokens with whitespace. Missing trailing whitespace around encoded word now register a defect instead. Original patch suggestion by David R. Murray on bpo-21315.
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/_header_value_parser.py21
1 files changed, 21 insertions, 0 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 34969ab..35d746a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+ =\? # literal =?
+ [^?]* # charset
+ \? # literal ?
+ [qQbB] # literal 'q' or 'b', case insensitive
+ \? # literal ?
+ .*? # encoded word
+ \?= # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
#
# TokenList and its subclasses
#
@@ -1052,6 +1064,10 @@ def get_encoded_word(value):
_validate_xtext(vtext)
ew.append(vtext)
text = ''.join(remainder)
+ # Encoded words should be followed by a WS
+ if value and value[0] not in WSP:
+ ew.defects.append(errors.InvalidHeaderDefect(
+ "missing trailing whitespace after encoded-word"))
return ew, value
def get_unstructured(value):
@@ -1104,6 +1120,11 @@ def get_unstructured(value):
unstructured.append(token)
continue
tok, *remainder = _wsp_splitter(value, 1)
+ # Split in the middle of an atom if there is a rfc2047 encoded word
+ # which does not have WSP on both sides. The defect will be registered
+ # the next time through the loop.
+ if rfc2047_matcher.search(tok):
+ tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext)
unstructured.append(vtext)