#14291: if a header has non-ascii unicode, default to CTE using utf-8

In Python 2, if a unicode string containing non-ASCII characters was assigned as the value of a header, email would automatically CTE-encode it using the UTF-8 charset. This capability was lost in the Python 3 translation, and this patch restores it. Patch by Ali Ikinci, assisted by R. David Murray. I also added a fix for the mailbox test that was depending (with a comment that it was a bad idea to so depend) on non-ASCII input causing message_from_string to raise an error. It now uses support.patch to induce an error during message serialization.
author: R David Murray <rdmurray@bitdance.com> 2012-03-14 06:59:51 (GMT)
committer: R David Murray <rdmurray@bitdance.com> 2012-03-14 06:59:51 (GMT)
commit: 7441a7aedd251d529eb14eff9a16708e9cb32409 (patch)
tree: 1d525eb5ac468752cacf460b4228a0150ee48814 /Lib/email
parent: 21c71bac5f684b0ec1665d841d05f91e078c3964 (diff)
download: cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.zip
cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.tar.gz
cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.tar.bz2
2 files changed, 25 insertions, 3 deletions
diff --git a/Lib/email/header.py b/Lib/email/header.py
index 2e687b7..3250d36 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -283,7 +283,12 @@ class Header:
         # character set, otherwise an early error is thrown.
         output_charset = charset.output_codec or 'us-ascii'
         if output_charset != _charset.UNKNOWN8BIT:
-            s.encode(output_charset, errors)
+            try:
+                s.encode(output_charset, errors)
+            except UnicodeEncodeError:
+                if output_charset!='us-ascii':
+                    raise
+                charset = UTF8
         self._chunks.append((s, charset))
 
     def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 102e15b..f43bb38 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -619,6 +619,19 @@ class TestMessageAPI(TestEmailBase):
         msg['Dummy'] = 'dummy\nX-Injected-Header: test'
         self.assertRaises(errors.HeaderParseError, msg.as_string)
 
+    def test_unicode_header_defaults_to_utf8_encoding(self):
+        # Issue 14291
+        m = MIMEText('abc\n')
+        m['Subject'] = 'É test'
+        self.assertEqual(str(m),textwrap.dedent("""\
+            Content-Type: text/plain; charset="us-ascii"
+            MIME-Version: 1.0
+            Content-Transfer-Encoding: 7bit
+            Subject: =?utf-8?q?=C3=89_test?=
+
+            abc
+            """))
+
 # Test the email.encoders module
 class TestEncoders(unittest.TestCase):
 
@@ -1060,9 +1073,13 @@ Subject: =?iso-8859-1?q?Britische_Regierung_gibt_gr=FCnes_Licht_f=FCr_Offshore-W
                          'f\xfcr Offshore-Windkraftprojekte '
                          '<a-very-long-address@example.com>')
         msg['Reply-To'] = header_string
-        self.assertRaises(UnicodeEncodeError, msg.as_string)
+        eq(msg.as_string(maxheaderlen=78), """\
+Reply-To: =?utf-8?q?Britische_Regierung_gibt_gr=C3=BCnes_Licht_f=C3=BCr_Offs?=
+ =?utf-8?q?hore-Windkraftprojekte_=3Ca-very-long-address=40example=2Ecom=3E?=
+
+""")
         msg = Message()
-        msg['Reply-To'] = Header(header_string, 'utf-8',
+        msg['Reply-To'] = Header(header_string,
                                  header_name='Reply-To')
         eq(msg.as_string(maxheaderlen=78), """\
 Reply-To: =?utf-8?q?Britische_Regierung_gibt_gr=C3=BCnes_Licht_f=C3=BCr_Offs?=
author	R David Murray <rdmurray@bitdance.com>	2012-03-14 06:59:51 (GMT)
committer	R David Murray <rdmurray@bitdance.com>	2012-03-14 06:59:51 (GMT)
commit	7441a7aedd251d529eb14eff9a16708e9cb32409 (patch)
tree	1d525eb5ac468752cacf460b4228a0150ee48814 /Lib/email
parent	21c71bac5f684b0ec1665d841d05f91e078c3964 (diff)
download	cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.zip cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.tar.gz cpython-7441a7aedd251d529eb14eff9a16708e9cb32409.tar.bz2