1 files changed, 16 insertions, 6 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST    = 3 # the shorter of QP and base64, but only for headers
 RFC2047_CHROME_LEN = 7
 
 DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
 EMPTYSTRING = ''
 
 
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
 
 
 
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
+def _encode(string, codec):
+    if codec == UNKNOWN8BIT:
+        return string.encode('ascii', 'surrogateescape')
+    else:
+        return string.encode(codec)
+
+
+
 class Charset:
     """Map character sets to their email properties.
 
@@ -282,8 +293,7 @@ class Charset:
         :return: The encoded string, with RFC 2047 chrome.
         """
         codec = self.output_codec or 'us-ascii'
-        charset = self.get_output_charset()
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
         encoder_module = self._get_encoder(header_bytes)
         if encoder_module is None:
@@ -309,7 +319,7 @@ class Charset:
         """
         # See which encoding we should use.
         codec = self.output_codec or 'us-ascii'
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         encoder_module = self._get_encoder(header_bytes)
         encoder = partial(encoder_module.header_encode, charset=str(self))
         # Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@ class Charset:
         for character in string:
             current_line.append(character)
             this_line = EMPTYSTRING.join(current_line)
-            length = encoder_module.header_length(this_line.encode(charset))
+            length = encoder_module.header_length(_encode(this_line, charset))
             if length > maxlen:
                 # This last character doesn't fit so pop it off.
                 current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
                 else:
                     separator = (' ' if lines else '')
                     joined_line = EMPTYSTRING.join(current_line)
-                    header_bytes = joined_line.encode(codec)
+                    header_bytes = _encode(joined_line, codec)
                     lines.append(encoder(header_bytes))
                 current_line = [character]
                 maxlen = next(maxlengths) - extra
         joined_line = EMPTYSTRING.join(current_line)
-        header_bytes = joined_line.encode(codec)
+        header_bytes = _encode(joined_line, codec)
         lines.append(encoder(header_bytes))
         return lines