summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/email/_encoded_words.py 10
-rw-r--r-- Lib/email/_header_value_parser.py 2
-rw-r--r-- Lib/test/test_email/test__encoded_words.py 7
-rw-r--r-- Lib/test/test_email/test_email.py 9
-rw-r--r-- Lib/test/test_email/test_headerregistry.py 12
-rw-r--r-- Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst 2
6 files changed, 36 insertions, 6 deletions
diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py
index 295ae7e..6795a60 100644
--- a/Lib/email/_encoded_words.py
+++ b/Lib/email/_encoded_words.py
@@ -179,15 +179,15 @@ def decode(ew):
# Turn the CTE decoded bytes into unicode.
try:
string = bstring.decode(charset)
- except UnicodeError:
+ except UnicodeDecodeError:
defects.append(errors.UndecodableBytesDefect("Encoded word "
- "contains bytes not decodable using {} charset".format(charset)))
+ f"contains bytes not decodable using {charset!r} charset"))
string = bstring.decode(charset, 'surrogateescape')
- except LookupError:
+ except (LookupError, UnicodeEncodeError):
string = bstring.decode('ascii', 'surrogateescape')
if charset.lower() != 'unknown-8bit':
- defects.append(errors.CharsetError("Unknown charset {} "
- "in encoded word; decoded as unknown bytes".format(charset)))
+ defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
+ f"in encoded word; decoded as unknown bytes"))
return string, charset, lang, defects
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 51d355f..8a8fb8b 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -781,7 +781,7 @@ class MimeParameters(TokenList):
else:
try:
value = value.decode(charset, 'surrogateescape')
- except LookupError:
+ except (LookupError, UnicodeEncodeError):
# XXX: there should really be a custom defect for
# unknown character set to make it easy to find,
# because otherwise unknown charset is a silent
diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py
index 0b8b1de..1713962 100644
--- a/Lib/test/test_email/test__encoded_words.py
+++ b/Lib/test/test_email/test__encoded_words.py
@@ -130,6 +130,13 @@ class TestDecode(TestEmailBase):
# XXX Should this be a new Defect instead?
defects = [errors.CharsetError])
+ def test_invalid_character_in_charset(self):
+ self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=',
+ b'foo\xacbar'.decode('ascii', 'surrogateescape'),
+ charset = 'utf-8\udce2\udc80\udc9d',
+ # XXX Should this be a new Defect instead?
+ defects = [errors.CharsetError])
+
def test_q_nonascii(self):
self._test('=?utf-8?q?=C3=89ric?=',
'Éric',
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py
index 933aa4c..69f883a 100644
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -5360,6 +5360,15 @@ Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(), 'myfile.txt')
+ def test_rfc2231_bad_character_in_encoding(self):
+ m = """\
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
+
+"""
+ msg = email.message_from_string(m)
+ self.assertEqual(msg.get_filename(), 'myfile.txt')
+
def test_rfc2231_single_tick_in_filename_extended(self):
eq = self.assertEqual
m = """\
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index 59fcd93..25347ef 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -714,6 +714,18 @@ class TestContentTypeHeader(TestHeaderBase):
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
),
+ 'rfc2231_nonascii_in_charset_of_charset_parameter_value': (
+ "text/plain; charset*=utf-8”''utf-8%E2%80%9D",
+ 'text/plain',
+ 'text',
+ 'plain',
+ {'charset': 'utf-8”'},
+ [],
+ 'text/plain; charset="utf-8”"',
+ "Content-Type: text/plain;"
+ " charset*=utf-8''utf-8%E2%80%9D\n",
+ ),
+
'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
'\tname*0*="us-ascii\'en-us\'My";'
diff --git a/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst
new file mode 100644
index 0000000..98d7310
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst
@@ -0,0 +1,2 @@
+Fix errors in the :mod:`email` module if the charset itself contains
+undecodable/unencodable characters.