diff options
-rw-r--r-- | Doc/library/email.policy.rst | 8 | ||||
-rw-r--r-- | Doc/whatsnew/3.5.rst | 6 | ||||
-rw-r--r-- | Lib/email/_header_value_parser.py | 11 | ||||
-rw-r--r-- | Lib/email/policy.py | 15 | ||||
-rw-r--r-- | Lib/test/test_email/test_generator.py | 22 | ||||
-rw-r--r-- | Lib/test/test_email/test_policy.py | 4 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
7 files changed, 64 insertions, 5 deletions
diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index d4e3fc1..9fadfb3 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -378,6 +378,14 @@ added matters. To illustrate:: In addition to the settable attributes listed above that apply to all policies, this policy adds the following additional attributes: + .. attribute:: utf8 + + If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in + headers by encoding them as "encoded words". If ``True``, follow + :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + formatted in this way may be passed to SMTP servers that support + the ``SMTPUTF8`` extension (:rfc:`6531`). + .. attribute:: refold_source If the value for a header in the ``Message`` object originated from a diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index 0360de4..51a3aa3 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -356,6 +356,12 @@ email header (``None`` if there is no such header). (Contributed by Abhilash Raj in :issue:`21083`.) +* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set + ``True`` to encode email headers using the utf8 charset instead of using + encoded words. This allows ``Messages`` to be formatted according to + :rfc:`6532` and used with an SMTP server that supports the :rfc:`6531` + ``SMTPUTF8`` extension. (Contributed by R. David Murray in :issue:`24211`.) + glob ---- diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index a9bdf44..f264191 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -320,17 +320,18 @@ class TokenList(list): return ''.join(res) def _fold(self, folded): + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) tlen = len(tstr) try: - str(part).encode('us-ascii') + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): charset = 'unknown-8bit' else: - # XXX: this should be a policy setting + # XXX: this should be a policy setting when utf8 is False. charset = 'utf-8' tstr = part.cte_encode(charset, folded.policy) tlen = len(tstr) @@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList): def _fold(self, folded): last_ew = None + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) is_ew = False try: - str(part).encode('us-ascii') + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): @@ -475,12 +477,13 @@ class Phrase(TokenList): # comment that becomes a barrier across which we can't compose encoded # words. last_ew = None + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) tlen = len(tstr) has_ew = False try: - str(part).encode('us-ascii') + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): diff --git a/Lib/email/policy.py b/Lib/email/policy.py index f0b20f4..6ac64a5 100644 --- a/Lib/email/policy.py +++ b/Lib/email/policy.py @@ -35,6 +35,13 @@ class EmailPolicy(Policy): In addition to the settable attributes listed above that apply to all Policies, this policy adds the following additional attributes: + utf8 -- if False (the default) message headers will be + serialized as ASCII, using encoded words to encode + any non-ASCII characters in the source strings. If + True, the message headers will be serialized using + utf8 and will not contain encoded words (see RFC + 6532 for more on this serialization format). + refold_source -- if the value for a header in the Message object came from the parsing of some source, this attribute indicates whether or not a generator should refold @@ -72,6 +79,7 @@ class EmailPolicy(Policy): """ + utf8 = False refold_source = 'long' header_factory = HeaderRegistry() content_manager = raw_data_manager @@ -175,9 +183,13 @@ class EmailPolicy(Policy): refold_header setting, since there is no way to know whether the binary data consists of single byte characters or multibyte characters. + If utf8 is true, headers are encoded to utf8, otherwise to ascii with + non-ASCII unicode rendered as encoded words. + """ folded = self._fold(name, value, refold_binary=self.cte_type=='7bit') - return folded.encode('ascii', 'surrogateescape') + charset = 'utf8' if self.utf8 else 'ascii' + return folded.encode(charset, 'surrogateescape') def _fold(self, name, value, refold_binary=False): if hasattr(value, 'name'): @@ -199,3 +211,4 @@ del default.header_factory strict = default.clone(raise_on_defect=True) SMTP = default.clone(linesep='\r\n') HTTP = default.clone(linesep='\r\n', max_line_length=None) +SMTPUTF8 = SMTP.clone(utf8=True) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 8917408..920f870 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -2,6 +2,7 @@ import io import textwrap import unittest from email import message_from_string, message_from_bytes +from email.message import EmailMessage from email.generator import Generator, BytesGenerator from email import policy from test.test_email import TestEmailBase, parameterize @@ -194,6 +195,27 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase): g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_smtputf8_policy(self): + msg = EmailMessage() + msg['From'] = "Páolo <főo@bar.com>" + msg['To'] = 'Dinsdale' + msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' + msg.set_content("oh là là, know what I mean, know what I mean?") + expected = textwrap.dedent("""\ + From: Páolo <főo@bar.com> + To: Dinsdale + Subject: Nudge nudge, wink, wink \u1F609 + Content-Type: text/plain; charset="utf-8" + Content-Transfer-Encoding: 8bit + MIME-Version: 1.0 + + oh là là, know what I mean, know what I mean? + """).encode('utf-8').replace(b'\n', b'\r\n') + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.SMTPUTF8) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py index e797f36..4b0a04e 100644 --- a/Lib/test/test_email/test_policy.py +++ b/Lib/test/test_email/test_policy.py @@ -27,6 +27,7 @@ class PolicyAPITests(unittest.TestCase): # If any of these defaults change, the docs must be updated. policy_defaults = compat32_defaults.copy() policy_defaults.update({ + 'utf8': False, 'raise_on_defect': False, 'header_factory': email.policy.EmailPolicy.header_factory, 'refold_source': 'long', @@ -42,6 +43,9 @@ class PolicyAPITests(unittest.TestCase): email.policy.default: make_defaults(policy_defaults, {}), email.policy.SMTP: make_defaults(policy_defaults, {'linesep': '\r\n'}), + email.policy.SMTPUTF8: make_defaults(policy_defaults, + {'linesep': '\r\n', + 'utf8': True}), email.policy.HTTP: make_defaults(policy_defaults, {'linesep': '\r\n', 'max_line_length': None}), @@ -47,6 +47,9 @@ Core and Builtins Library ------- +- Issue #24211: The email library now supports RFC 6532: it can generate + headers using utf-8 instead of encoded words. + - Issue #16314: Added support for the LZMA compression in distutils. - Issue #21804: poplib now supports RFC 6856 (UTF8). |