#24211: Add RFC6532 support to the email library.

This could use more edge case tests, but the basic functionality is tested. (Note that this changeset does not add tailored support for the RFC 6532 message/global MIME type, but the email package generic facilities will handle it.) Reviewed by Maciej Szulik.
author: R David Murray <rdmurray@bitdance.com> 2015-05-17 15:29:21 (GMT)
committer: R David Murray <rdmurray@bitdance.com> 2015-05-17 15:29:21 (GMT)
commit: 224ef3ec3b0758956789c4c98e6cc93704304419 (patch)
tree: 4e06303f8a764f65842b432057845c012916ab5c /Lib/email
parent: c1ecef78a3d5f5db47df66cc24929a66466fdcd4 (diff)
download: cpython-224ef3ec3b0758956789c4c98e6cc93704304419.zip
cpython-224ef3ec3b0758956789c4c98e6cc93704304419.tar.gz
cpython-224ef3ec3b0758956789c4c98e6cc93704304419.tar.bz2
2 files changed, 21 insertions, 5 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index a9bdf44..f264191 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -320,17 +320,18 @@ class TokenList(list):
         return ''.join(res)
 
     def _fold(self, folded):
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
                     charset = 'unknown-8bit'
                 else:
-                    # XXX: this should be a policy setting
+                    # XXX: this should be a policy setting when utf8 is False.
                     charset = 'utf-8'
                 tstr = part.cte_encode(charset, folded.policy)
                 tlen = len(tstr)
@@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList):
 
     def _fold(self, folded):
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             is_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                        for x in part.all_defects):
@@ -475,12 +477,13 @@ class Phrase(TokenList):
         # comment that becomes a barrier across which we can't compose encoded
         # words.
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             has_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
index f0b20f4..6ac64a5 100644
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -35,6 +35,13 @@ class EmailPolicy(Policy):
     In addition to the settable attributes listed above that apply to
     all Policies, this policy adds the following additional attributes:
 
+    utf8                -- if False (the default) message headers will be
+                           serialized as ASCII, using encoded words to encode
+                           any non-ASCII characters in the source strings.  If
+                           True, the message headers will be serialized using
+                           utf8 and will not contain encoded words (see RFC
+                           6532 for more on this serialization format).
+
     refold_source       -- if the value for a header in the Message object
                            came from the parsing of some source, this attribute
                            indicates whether or not a generator should refold
@@ -72,6 +79,7 @@ class EmailPolicy(Policy):
 
     """
 
+    utf8 = False
     refold_source = 'long'
     header_factory = HeaderRegistry()
     content_manager = raw_data_manager
@@ -175,9 +183,13 @@ class EmailPolicy(Policy):
         refold_header setting, since there is no way to know whether the binary
         data consists of single byte characters or multibyte characters.
 
+        If utf8 is true, headers are encoded to utf8, otherwise to ascii with
+        non-ASCII unicode rendered as encoded words.
+
         """
         folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
-        return folded.encode('ascii', 'surrogateescape')
+        charset = 'utf8' if self.utf8 else 'ascii'
+        return folded.encode(charset, 'surrogateescape')
 
     def _fold(self, name, value, refold_binary=False):
         if hasattr(value, 'name'):
@@ -199,3 +211,4 @@ del default.header_factory
 strict = default.clone(raise_on_defect=True)
 SMTP = default.clone(linesep='\r\n')
 HTTP = default.clone(linesep='\r\n', max_line_length=None)
+SMTPUTF8 = SMTP.clone(utf8=True)
author	R David Murray <rdmurray@bitdance.com>	2015-05-17 15:29:21 (GMT)
committer	R David Murray <rdmurray@bitdance.com>	2015-05-17 15:29:21 (GMT)
commit	224ef3ec3b0758956789c4c98e6cc93704304419 (patch)
tree	4e06303f8a764f65842b432057845c012916ab5c /Lib/email
parent	c1ecef78a3d5f5db47df66cc24929a66466fdcd4 (diff)
download	cpython-224ef3ec3b0758956789c4c98e6cc93704304419.zip cpython-224ef3ec3b0758956789c4c98e6cc93704304419.tar.gz cpython-224ef3ec3b0758956789c4c98e6cc93704304419.tar.bz2