7 files changed, 64 insertions, 5 deletions
diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst
index d4e3fc1..9fadfb3 100644
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -378,6 +378,14 @@ added matters.  To illustrate::
    In addition to the settable attributes listed above that apply to all
    policies, this policy adds the following additional attributes:
 
+   .. attribute:: utf8
+
+      If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
+      headers by encoding them as "encoded words".  If ``True``, follow
+      :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
+      formatted in this way may be passed to SMTP servers that support
+      the ``SMTPUTF8`` extension (:rfc:`6531`).
+
    .. attribute:: refold_source
 
       If the value for a header in the ``Message`` object originated from a
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
index 0360de4..51a3aa3 100644
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -356,6 +356,12 @@ email
   header (``None`` if there is no such header).  (Contributed by Abhilash Raj
   in :issue:`21083`.)
 
+* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set to
+  ``True`` to encode email headers using the UTF-8 charset instead of using
+  encoded words.  This allows ``Messages`` to be formatted according to
+  :rfc:`6532` and used with an SMTP server that supports the :rfc:`6531`
+  ``SMTPUTF8`` extension.  (Contributed by R. David Murray in :issue:`24211`.)
+
 glob
 ----
 
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index a9bdf44..f264191 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -320,17 +320,18 @@ class TokenList(list):
         return ''.join(res)
 
     def _fold(self, folded):
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
                     charset = 'unknown-8bit'
                 else:
-                    # XXX: this should be a policy setting
+                    # XXX: this should be a policy setting when utf8 is False.
                     charset = 'utf-8'
                 tstr = part.cte_encode(charset, folded.policy)
                 tlen = len(tstr)
@@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList):
 
     def _fold(self, folded):
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             is_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                        for x in part.all_defects):
@@ -475,12 +477,13 @@ class Phrase(TokenList):
         # comment that becomes a barrier across which we can't compose encoded
         # words.
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             has_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
index f0b20f4..6ac64a5 100644
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -35,6 +35,13 @@ class EmailPolicy(Policy):
     In addition to the settable attributes listed above that apply to
     all Policies, this policy adds the following additional attributes:
 
+    utf8                -- if False (the default) message headers will be
+                           serialized as ASCII, using encoded words to encode
+                           any non-ASCII characters in the source strings.  If
+                           True, the message headers will be serialized using
+                           utf8 and will not contain encoded words (see RFC
+                           6532 for more on this serialization format).
+
     refold_source       -- if the value for a header in the Message object
                            came from the parsing of some source, this attribute
                            indicates whether or not a generator should refold
@@ -72,6 +79,7 @@ class EmailPolicy(Policy):
 
     """
 
+    utf8 = False
     refold_source = 'long'
     header_factory = HeaderRegistry()
     content_manager = raw_data_manager
@@ -175,9 +183,13 @@ class EmailPolicy(Policy):
         refold_header setting, since there is no way to know whether the binary
         data consists of single byte characters or multibyte characters.
 
+        If utf8 is true, headers are encoded to utf8, otherwise to ascii with
+        non-ASCII unicode rendered as encoded words.
+
         """
         folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
-        return folded.encode('ascii', 'surrogateescape')
+        charset = 'utf8' if self.utf8 else 'ascii'
+        return folded.encode(charset, 'surrogateescape')
 
     def _fold(self, name, value, refold_binary=False):
         if hasattr(value, 'name'):
@@ -199,3 +211,4 @@ del default.header_factory
 strict = default.clone(raise_on_defect=True)
 SMTP = default.clone(linesep='\r\n')
 HTTP = default.clone(linesep='\r\n', max_line_length=None)
+SMTPUTF8 = SMTP.clone(utf8=True)
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index 8917408..920f870 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -2,6 +2,7 @@ import io
 import textwrap
 import unittest
 from email import message_from_string, message_from_bytes
+from email.message import EmailMessage
 from email.generator import Generator, BytesGenerator
 from email import policy
 from test.test_email import TestEmailBase, parameterize
@@ -194,6 +195,27 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
         g.flatten(msg)
         self.assertEqual(s.getvalue(), expected)
 
+    def test_smtputf8_policy(self):
+        msg = EmailMessage()
+        msg['From'] = "Páolo <főo@bar.com>"
+        msg['To'] = 'Dinsdale'
+        msg['Subject'] = 'Nudge nudge, wink, wink \U0001F609'  # WINKING FACE
+        msg.set_content("oh là là, know what I mean, know what I mean?")
+        expected = textwrap.dedent("""\
+            From: Páolo <főo@bar.com>
+            To: Dinsdale
+            Subject: Nudge nudge, wink, wink \U0001F609
+            Content-Type: text/plain; charset="utf-8"
+            Content-Transfer-Encoding: 8bit
+            MIME-Version: 1.0
+
+            oh là là, know what I mean, know what I mean?
+            """).encode('utf-8').replace(b'\n', b'\r\n')
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.SMTPUTF8)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py
index e797f36..4b0a04e 100644
--- a/Lib/test/test_email/test_policy.py
+++ b/Lib/test/test_email/test_policy.py
@@ -27,6 +27,7 @@ class PolicyAPITests(unittest.TestCase):
     # If any of these defaults change, the docs must be updated.
     policy_defaults = compat32_defaults.copy()
     policy_defaults.update({
+        'utf8':                     False,
         'raise_on_defect':          False,
         'header_factory':           email.policy.EmailPolicy.header_factory,
         'refold_source':            'long',
@@ -42,6 +43,9 @@ class PolicyAPITests(unittest.TestCase):
         email.policy.default: make_defaults(policy_defaults, {}),
         email.policy.SMTP: make_defaults(policy_defaults,
                                          {'linesep': '\r\n'}),
+        email.policy.SMTPUTF8: make_defaults(policy_defaults,
+                                             {'linesep': '\r\n',
+                                              'utf8': True}),
         email.policy.HTTP: make_defaults(policy_defaults,
                                          {'linesep': '\r\n',
                                           'max_line_length': None}),
diff --git a/Misc/NEWS b/Misc/NEWS
index c905bcc..5ae6031 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #24211: The email library now supports RFC 6532: it can generate
+  headers using utf-8 instead of encoded words.
+
 - Issue #16314: Added support for the LZMA compression in distutils.
 
 - Issue #21804: poplib now supports RFC 6856 (UTF8).