summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author R David Murray <rdmurray@bitdance.com> 2013-07-11 19:58:07 (GMT)
committer R David Murray <rdmurray@bitdance.com> 2013-07-11 19:58:07 (GMT)
commit 63194a774e47ab5c7a10a693ef53188cf24dce16 (patch)
tree b2a6fa5fb2c9057a908488bb150ba12e43ab5701 /Lib
parent f9e6672ae8044f9dbcbafe98a6b63ab30189770e (diff)
parent 65171b28e77f589a490335c8749a24151e1d8817 (diff)
download cpython-63194a774e47ab5c7a10a693ef53188cf24dce16.zip
cpython-63194a774e47ab5c7a10a693ef53188cf24dce16.tar.gz
cpython-63194a774e47ab5c7a10a693ef53188cf24dce16.tar.bz2
Merge: #18044: Fix parsing of encoded words of the form =?utf8?q?=XX...?=
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/email/_header_value_parser.py 43
-rw-r--r-- Lib/test/test_email/test__encoded_words.py 5
-rw-r--r-- Lib/test/test_email/test__header_value_parser.py 9
-rw-r--r-- Lib/test/test_email/test_headerregistry.py 41
4 files changed, 58 insertions, 40 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index eb31558..32fc06e 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -69,6 +69,7 @@ XXX: provide complete list of token types.
import re
import urllib # For urllib.parse.unquote
+from string import hexdigits
from collections import namedtuple, OrderedDict
from email import _encoded_words as _ew
from email import errors
@@ -391,10 +392,6 @@ class UnstructuredTokenList(TokenList):
token_type = 'unstructured'
def _fold(self, folded):
- if any(x.token_type=='encoded-word' for x in self):
- return self._fold_encoded(folded)
- # Here we can have either a pure ASCII string that may or may not
- # have surrogateescape encoded bytes, or a unicode string.
last_ew = None
for part in self.parts:
tstr = str(part)
@@ -1386,35 +1383,6 @@ def _get_ptext_to_endchars(value, endchars):
pos = pos + 1
return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
-def _decode_ew_run(value):
- """ Decode a run of RFC2047 encoded words.
-
- _decode_ew_run(value) -> (text, value, defects)
-
- Scans the supplied value for a run of tokens that look like they are RFC
- 2047 encoded words, decodes those words into text according to RFC 2047
- rules (whitespace between encoded words is discarded), and returns the text
- and the remaining value (including any leading whitespace on the remaining
- value), as well as a list of any defects encountered while decoding. The
- input value may not have any leading whitespace.
-
- """
- res = []
- defects = []
- last_ws = ''
- while value:
- try:
- tok, ws, value = _wsp_splitter(value, 1)
- except ValueError:
- tok, ws, value = value, '', ''
- if not (tok.startswith('=?') and tok.endswith('?=')):
- return ''.join(res), last_ws + tok + ws + value, defects
- text, charset, lang, new_defects = _ew.decode(tok)
- res.append(text)
- defects.extend(new_defects)
- last_ws = ws
- return ''.join(res), last_ws, defects
-
def get_fws(value):
"""FWS = 1*WSP
@@ -1440,7 +1408,8 @@ def get_encoded_word(value):
raise errors.HeaderParseError(
"expected encoded word but found {}".format(value))
remstr = ''.join(remainder)
- if remstr[:2].isdigit():
+ if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
+ # The ? after the CTE was followed by an encoded word escape (=XX).
rest, *remainder = remstr.split('?=', 1)
tok = tok + '?=' + rest
if len(tok.split()) > 1:
@@ -1488,8 +1457,8 @@ def get_unstructured(value):
"""
# XXX: but what about bare CR and LF? They might signal the start or
- # end of an encoded word. YAGNI for now, since out current parsers
- # will never send us strings with bard CR or LF.
+ # end of an encoded word. YAGNI for now, since our current parsers
+ # will never send us strings with bare CR or LF.
unstructured = UnstructuredTokenList()
while value:
@@ -1501,6 +1470,8 @@ def get_unstructured(value):
try:
token, value = get_encoded_word(value)
except errors.HeaderParseError:
+ # XXX: Need to figure out how to register defects when
+ # appropriate here.
pass
else:
have_ws = True
diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py
index 14395fe..f8e380d 100644
--- a/Lib/test/test_email/test__encoded_words.py
+++ b/Lib/test/test_email/test__encoded_words.py
@@ -122,6 +122,11 @@ class TestDecode(TestEmailBase):
# XXX Should this be a new Defect instead?
defects = [errors.CharsetError])
+ def test_q_nonascii(self):
+ self._test('=?utf-8?q?=C3=89ric?=',
+ 'Éric',
+ charset='utf-8')
+
class TestEncodeQ(TestEmailBase):
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 6101e19..8917447 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -170,6 +170,15 @@ class TestParser(TestParserMixin, TestEmailBase):
[],
'')
+ def test_get_encoded_word_quopri_utf_escape_follows_cte(self):
+ # Issue 18044
+ self._test_get_x(parser.get_encoded_word,
+ '=?utf-8?q?=C3=89ric?=',
+ 'Éric',
+ 'Éric',
+ [],
+ '')
+
# get_unstructured
def _get_unst(self, value):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index c0c81c1..80f1c02 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -123,12 +123,45 @@ class TestBaseHeaderFeatures(TestHeaderBase):
# self.assertEqual(h, value)
# self.assertDefectsEqual(h.defects, [errors.ObsoleteHeaderDefect])
- def test_RFC2047_value_decoded(self):
- value = '=?utf-8?q?this_is_a_test?='
- h = self.make_header('subject', value)
- self.assertEqual(h, 'this is a test')
+@parameterize
+class TestUnstructuredHeader(TestHeaderBase):
+ def string_as_value(self,
+ source,
+ decoded,
+ *args):
+ l = len(args)
+ defects = args[0] if l>0 else []
+ header = 'Subject:' + (' ' if source else '')
+ folded = header + (args[1] if l>1 else source) + '\n'
+ h = self.make_header('Subject', source)
+ self.assertEqual(h, decoded)
+ self.assertDefectsEqual(h.defects, defects)
+ self.assertEqual(h.fold(policy=policy.default), folded)
+
+ string_params = {
+
+ 'rfc2047_simple_quopri': (
+ '=?utf-8?q?this_is_a_test?=',
+ 'this is a test',
+ [],
+ 'this is a test'),
+
+ 'rfc2047_gb2312_base64': (
+ '=?gb2312?b?1eLKx9bQzsSy4srUo6E=?=',
+ '\u8fd9\u662f\u4e2d\u6587\u6d4b\u8bd5\uff01',
+ [],
+ '=?utf-8?b?6L+Z5piv5Lit5paH5rWL6K+V77yB?='),
+
+ 'rfc2047_simple_nonascii_quopri': (
+ '=?utf-8?q?=C3=89ric?=',
+ 'Éric'),
+
+ }
+
+
+@parameterize
class TestDateHeader(TestHeaderBase):
datestring = 'Sun, 23 Sep 2001 20:10:55 -0700'