summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/email/message.py 29
-rw-r--r-- Lib/email/utils.py 4
-rw-r--r-- Lib/test/test_email/test_message.py 29
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst 3
4 files changed, 49 insertions, 16 deletions
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 411118c..fe76958 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -289,25 +289,26 @@ class Message:
# cte might be a Header, so for now stringify it.
cte = str(self.get('content-transfer-encoding', '')).lower()
# payload may be bytes here.
- if isinstance(payload, str):
- if utils._has_surrogates(payload):
- bpayload = payload.encode('ascii', 'surrogateescape')
- if not decode:
+ if not decode:
+ if isinstance(payload, str) and utils._has_surrogates(payload):
+ try:
+ bpayload = payload.encode('ascii', 'surrogateescape')
try:
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
except LookupError:
payload = bpayload.decode('ascii', 'replace')
- elif decode:
- try:
- bpayload = payload.encode('ascii')
- except UnicodeError:
- # This won't happen for RFC compliant messages (messages
- # containing only ASCII code points in the unicode input).
- # If it does happen, turn the string into bytes in a way
- # guaranteed not to fail.
- bpayload = payload.encode('raw-unicode-escape')
- if not decode:
+ except UnicodeEncodeError:
+ pass
return payload
+ if isinstance(payload, str):
+ try:
+ bpayload = payload.encode('ascii', 'surrogateescape')
+ except UnicodeEncodeError:
+ # This won't happen for RFC compliant messages (messages
+ # containing only ASCII code points in the unicode input).
+ # If it does happen, turn the string into bytes in a way
+ # guaranteed not to fail.
+ bpayload = payload.encode('raw-unicode-escape')
if cte == 'quoted-printable':
return quopri.decodestring(bpayload)
elif cte == 'base64':
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index a49a8fa..9175f2f 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -44,10 +44,10 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
def _has_surrogates(s):
- """Return True if s contains surrogate-escaped binary data."""
+ """Return True if s may contain surrogate-escaped binary data."""
# This check is based on the fact that unless there are surrogates, utf8
# (Python's default encoding) can encode any string. This is the fastest
- # way to check for surrogates, see issue 11454 for timings.
+ # way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
try:
s.encode()
return False
diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py
index d3f396f..034f762 100644
--- a/Lib/test/test_email/test_message.py
+++ b/Lib/test/test_email/test_message.py
@@ -748,6 +748,35 @@ class TestEmailMessageBase:
self.assertEqual(len(list(m.iter_attachments())), 2)
self.assertEqual(m.get_payload(), orig)
+ get_payload_surrogate_params = {
+
+ 'good_surrogateescape': (
+ "String that can be encod\udcc3\udcabd with surrogateescape",
+ b'String that can be encod\xc3\xabd with surrogateescape'
+ ),
+
+ 'string_with_utf8': (
+ "String with utf-8 charactër",
+ b'String with utf-8 charact\xebr'
+ ),
+
+ 'surrogate_and_utf8': (
+ "String that cannot be ëncod\udcc3\udcabd with surrogateescape",
+ b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
+ ),
+
+ 'out_of_range_surrogate': (
+ "String with \udfff cannot be encoded with surrogateescape",
+ b'String with \\udfff cannot be encoded with surrogateescape'
+ ),
+ }
+
+ def get_payload_surrogate_as_gh_94606(self, msg, expected):
+ """test for GH issue 94606"""
+ m = self._str_msg(msg)
+ payload = m.get_payload(decode=True)
+ self.assertEqual(expected, payload)
+
class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
message = EmailMessage
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst
new file mode 100644
index 0000000..5201ab7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst
@@ -0,0 +1,3 @@
+Fix :exc:`UnicodeEncodeError` when :meth:`email.message.Message.get_payload`
+reads a message with a Unicode surrogate character and the message content is
+not well-formed for surrogateescape encoding. Patch by Sidney Markowitz.