summary refs log tree commit diff stats
path: root/Lib/email/charset.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- Lib/email/charset.py 22
1 files changed, 16 insertions, 6 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
+def _encode(string, codec):
+ if codec == UNKNOWN8BIT:
+ return string.encode('ascii', 'surrogateescape')
+ else:
+ return string.encode(codec)
+
+
+
class Charset:
"""Map character sets to their email properties.
@@ -282,8 +293,7 @@ class Charset:
:return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
- charset = self.get_output_charset()
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
encoder_module = self._get_encoder(header_bytes)
if encoder_module is None:
@@ -309,7 +319,7 @@ class Charset:
"""
# See which encoding we should use.
codec = self.output_codec or 'us-ascii'
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
encoder_module = self._get_encoder(header_bytes)
encoder = partial(encoder_module.header_encode, charset=str(self))
# Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@ class Charset:
for character in string:
current_line.append(character)
this_line = EMPTYSTRING.join(current_line)
- length = encoder_module.header_length(this_line.encode(charset))
+ length = encoder_module.header_length(_encode(this_line, charset))
if length > maxlen:
# This last character doesn't fit so pop it off.
current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
else:
separator = (' ' if lines else '')
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
current_line = [character]
maxlen = next(maxlengths) - extra
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
return lines