summary | refs | log | tree | commit | diff | stats
path: root/Lib/email/charset.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- Lib/email/charset.py 189
1 files changed, 92 insertions, 97 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 1435ee5..9e5ee67 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -9,6 +9,8 @@ __all__ = [
'add_codec',
]
+from functools import partial
+
import email.base64mime
import email.quoprimime
@@ -23,9 +25,10 @@ BASE64 = 2 # Base64
SHORTEST = 3 # the shorter of QP and base64, but only for headers
# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
-MISC_LEN = 7
+RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+EMPTYSTRING = ''
@@ -259,63 +262,6 @@ class Charset:
else:
return encode_7or8bit
- def convert(self, s):
- """Convert a string from the input_codec to the output_codec."""
- if self.input_codec != self.output_codec:
- rawbytes = bytes(ord(c) for c in s)
- decoded = rawbytes.decode(self.input_codec)
- encoded = decoded.encode(self.output_codec)
- return str(encoded)
- else:
- return s
-
- def to_splittable(self, s):
- """Convert a possibly multibyte string to a safely splittable format.
-
- Uses the input_codec to try and convert the string to Unicode, so it
- can be safely split on character boundaries (even for multibyte
- characters).
-
- Returns the string as-is if it isn't known how to convert it to
- Unicode with the input_charset.
-
- Characters that could not be converted to Unicode will be replaced
- with the Unicode replacement character U+FFFD.
- """
- if isinstance(s, str) or self.input_codec is None:
- return s
- try:
- return str(s, self.input_codec, 'replace')
- except LookupError:
- # Input codec not installed on system, so return the original
- # string unchanged.
- return s
-
- def from_splittable(self, ustr, to_output=True):
- """Convert a splittable string back into an encoded string.
-
- Uses the proper codec to try and convert the string from Unicode back
- into an encoded format. Return the string as-is if it is not Unicode,
- or if it could not be converted from Unicode.
-
- Characters that could not be converted from Unicode will be replaced
- with an appropriate character (usually '?').
-
- If to_output is True (the default), uses output_codec to convert to an
- encoded format. If to_output is False, uses input_codec.
- """
- if to_output:
- codec = self.output_codec
- else:
- codec = self.input_codec
- if not isinstance(ustr, str) or codec is None:
- return ustr
- try:
- return str(ustr.encode(codec, 'replace'))
- except LookupError:
- # Output codec not installed
- return ustr
-
def get_output_charset(self):
"""Return the output character set.
@@ -324,66 +270,115 @@ class Charset:
"""
return self.output_charset or self.input_charset
- def encoded_header_len(self, s):
- """Return the length of the encoded header string."""
- cset = self.get_output_charset()
- # The len(s) of a 7bit encoding is len(s)
- if self.header_encoding == BASE64:
- return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
- elif self.header_encoding == QP:
- return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
- elif self.header_encoding == SHORTEST:
- lenb64 = email.base64mime.base64_len(s)
- lenqp = email.quoprimime.header_quopri_len(s)
- return min(lenb64, lenqp) + len(cset) + MISC_LEN
- else:
- return len(s)
-
def header_encode(self, string):
"""Header-encode a string by converting it first to bytes.
- :param string: A unicode string for the header. This must be
- encodable to bytes using the current character set's `output_codec`.
-
The type of encoding (base64 or quoted-printable) will be based on
this charset's `header_encoding`.
+
+ :param string: A unicode string for the header. It must be possible
+ to encode this string to bytes using the character set's
+ output codec.
+ :return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
charset = self.get_output_charset()
header_bytes = string.encode(codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
+ encoder_module = self._get_encoder(header_bytes)
+ if encoder_module is None:
+ return string
+ return encoder_module.header_encode(header_bytes, codec)
+
+ def header_encode_lines(self, string, maxlengths):
+ """Header-encode a string by converting it first to bytes.
+
+ This is similar to `header_encode()` except that the string is fit
+ into maximum line lengths as given by the arguments.
+
+ :param string: A unicode string for the header. It must be possible
+ to encode this string to bytes using the character set's
+ output codec.
+ :param maxlengths: Maximum line length iterator. Each element
+ returned from this iterator will provide the next maximum line
+ length. This parameter is used as an argument to built-in next()
+ and should never be exhausted. The maximum line lengths should
+ not count the RFC 2047 chrome. These line lengths are only a
+ hint; the splitter does the best it can.
+ The first value drawn from the iterator bounds the first line, and
+ each later value bounds the line that follows.
+ :return: Lines of encoded strings, each with RFC 2047 chrome.
+ """
+ # See which encoding we should use.
+ codec = self.output_codec or 'us-ascii'
+ header_bytes = string.encode(codec)
+ encoder_module = self._get_encoder(header_bytes)
+ encoder = partial(encoder_module.header_encode, charset=str(self))
+ # Calculate the number of characters that the RFC 2047 chrome will
+ # contribute to each line.
+ charset = self.get_output_charset()
+ extra = len(charset) + RFC2047_CHROME_LEN
+ # Now comes the hard part. We must encode bytes but we can't split on
+ # bytes because some character sets are variable length and each
+ # encoded word must stand on its own. So the problem is you have to
+ # encode to bytes to figure out this word's length, but you must split
+ # on characters. This causes two problems: first, we don't know how
+ # many octets a specific substring of unicode characters will get
+ # encoded to, and second, we don't know how many ASCII characters
+ # those octets will get encoded to. Unless we try it. Which seems
+ # inefficient. In the interest of being correct rather than fast (and
+ # in the hope that there will be few encoded headers in any such
+ # message), brute force it. :(
+ lines = []
+ current_line = []
+ maxlen = next(maxlengths) - extra
+ for character in string:
+ current_line.append(character)
+ this_line = EMPTYSTRING.join(current_line)
+ length = encoder_module.header_length(this_line.encode(charset))
+ if length > maxlen:
+ # This last character doesn't fit so pop it off.
+ current_line.pop()
+ # Does nothing fit on the first line?
+ if not lines and not current_line:
+ lines.append(None)
+ else:
+ separator = (' ' if lines else '')
+ joined_line = EMPTYSTRING.join(current_line)
+ header_bytes = joined_line.encode(codec)
+ lines.append(encoder(header_bytes))
+ current_line = [character]
+ maxlen = next(maxlengths) - extra
+ joined_line = EMPTYSTRING.join(current_line)
+ header_bytes = joined_line.encode(codec)
+ lines.append(encoder(header_bytes))
+ return lines
+
+ def _get_encoder(self, header_bytes):
if self.header_encoding == BASE64:
- encoder = email.base64mime.header_encode
+ return email.base64mime
elif self.header_encoding == QP:
- encoder = email.quoprimime.header_encode
+ return email.quoprimime
elif self.header_encoding == SHORTEST:
- lenb64 = email.base64mime.base64_len(header_bytes)
- lenqp = email.quoprimime.header_quopri_len(header_bytes)
- if lenb64 < lenqp:
- encoder = email.base64mime.header_encode
+ len64 = email.base64mime.header_length(header_bytes)
+ lenqp = email.quoprimime.header_length(header_bytes)
+ if len64 < lenqp:
+ return email.base64mime
else:
- encoder = email.quoprimime.header_encode
+ return email.quoprimime
else:
- return string
- return encoder(header_bytes, codec)
+ return None
- def body_encode(self, s, convert=True):
- """Body-encode a string and convert it to output_charset.
-
- If convert is True (the default), the string will be converted from
- the input charset to output charset automatically. Unlike
- header_encode(), there are no issues with byte boundaries and
- multibyte charsets in email bodies, so this is usually pretty safe.
+ def body_encode(self, string):
+ """Body-encode a string by converting it first to bytes.
The type of encoding (base64 or quoted-printable) will be based on
self.body_encoding.
"""
- if convert:
- s = self.convert(s)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
if self.body_encoding is BASE64:
- return email.base64mime.body_encode(s)
+ return email.base64mime.body_encode(string)
elif self.body_encoding is QP:
- return email.quoprimime.body_encode(s)
+ return email.quoprimime.body_encode(string)
else:
- return s
+ return string