diff options
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- | Lib/email/charset.py | 22 |
1 files changed, 16 insertions, 6 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
 RFC2047_CHROME_LEN = 7
 
 DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
 EMPTYSTRING = ''
 
 
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
 
 
 
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
+def _encode(string, codec):
+    if codec == UNKNOWN8BIT:
+        return string.encode('ascii', 'surrogateescape')
+    else:
+        return string.encode(codec)
+
+
+
 class Charset:
     """Map character sets to their email properties.
 
@@ -282,8 +293,7 @@ class Charset:
         :return: The encoded string, with RFC 2047 chrome.
         """
         codec = self.output_codec or 'us-ascii'
-        charset = self.get_output_charset()
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
         encoder_module = self._get_encoder(header_bytes)
         if encoder_module is None:
@@ -309,7 +319,7 @@ class Charset:
         """
         # See which encoding we should use.
         codec = self.output_codec or 'us-ascii'
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         encoder_module = self._get_encoder(header_bytes)
         encoder = partial(encoder_module.header_encode, charset=str(self))
         # Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@ class Charset:
         for character in string:
             current_line.append(character)
             this_line = EMPTYSTRING.join(current_line)
-            length = encoder_module.header_length(this_line.encode(charset))
+            length = encoder_module.header_length(_encode(this_line, charset))
             if length > maxlen:
                 # This last character doesn't fit so pop it off.
                 current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
                 else:
                     separator = (' ' if lines else '')
                     joined_line = EMPTYSTRING.join(current_line)
-                    header_bytes = joined_line.encode(codec)
+                    header_bytes = _encode(joined_line, codec)
                     lines.append(encoder(header_bytes))
                 current_line = [character]
                 maxlen = next(maxlengths) - extra
         joined_line = EMPTYSTRING.join(current_line)
-        header_bytes = joined_line.encode(codec)
+        header_bytes = _encode(joined_line, codec)
         lines.append(encoder(header_bytes))
         return lines
 