diff options
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- | Lib/email/charset.py | 189 |
1 files changed, 92 insertions, 97 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 1435ee5..9e5ee67 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -9,6 +9,8 @@ __all__ = [ 'add_codec', ] +from functools import partial + import email.base64mime import email.quoprimime @@ -23,9 +25,10 @@ BASE64 = 2 # Base64 SHORTEST = 3 # the shorter of QP and base64, but only for headers # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 -MISC_LEN = 7 +RFC2047_CHROME_LEN = 7 DEFAULT_CHARSET = 'us-ascii' +EMPTYSTRING = '' @@ -259,63 +262,6 @@ class Charset: else: return encode_7or8bit - def convert(self, s): - """Convert a string from the input_codec to the output_codec.""" - if self.input_codec != self.output_codec: - rawbytes = bytes(ord(c) for c in s) - decoded = rawbytes.decode(self.input_codec) - encoded = decoded.encode(self.output_codec) - return str(encoded) - else: - return s - - def to_splittable(self, s): - """Convert a possibly multibyte string to a safely splittable format. - - Uses the input_codec to try and convert the string to Unicode, so it - can be safely split on character boundaries (even for multibyte - characters). - - Returns the string as-is if it isn't known how to convert it to - Unicode with the input_charset. - - Characters that could not be converted to Unicode will be replaced - with the Unicode replacement character U+FFFD. - """ - if isinstance(s, str) or self.input_codec is None: - return s - try: - return str(s, self.input_codec, 'replace') - except LookupError: - # Input codec not installed on system, so return the original - # string unchanged. - return s - - def from_splittable(self, ustr, to_output=True): - """Convert a splittable string back into an encoded string. - - Uses the proper codec to try and convert the string from Unicode back - into an encoded format. Return the string as-is if it is not Unicode, - or if it could not be converted from Unicode. - - Characters that could not be converted from Unicode will be replaced - with an appropriate character (usually '?'). - - If to_output is True (the default), uses output_codec to convert to an - encoded format. If to_output is False, uses input_codec. - """ - if to_output: - codec = self.output_codec - else: - codec = self.input_codec - if not isinstance(ustr, str) or codec is None: - return ustr - try: - return str(ustr.encode(codec, 'replace')) - except LookupError: - # Output codec not installed - return ustr - def get_output_charset(self): """Return the output character set. @@ -324,66 +270,115 @@ class Charset: """ return self.output_charset or self.input_charset - def encoded_header_len(self, s): - """Return the length of the encoded header string.""" - cset = self.get_output_charset() - # The len(s) of a 7bit encoding is len(s) - if self.header_encoding == BASE64: - return email.base64mime.base64_len(s) + len(cset) + MISC_LEN - elif self.header_encoding == QP: - return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN - elif self.header_encoding == SHORTEST: - lenb64 = email.base64mime.base64_len(s) - lenqp = email.quoprimime.header_quopri_len(s) - return min(lenb64, lenqp) + len(cset) + MISC_LEN - else: - return len(s) - def header_encode(self, string): """Header-encode a string by converting it first to bytes. - :param string: A unicode string for the header. This must be - encodable to bytes using the current character set's `output_codec`. - The type of encoding (base64 or quoted-printable) will be based on this charset's `header_encoding`. + + :param string: A unicode string for the header. It must be possible + to encode this string to bytes using the character set's + output codec. + :return: The encoded string, with RFC 2047 chrome. """ codec = self.output_codec or 'us-ascii' charset = self.get_output_charset() header_bytes = string.encode(codec) # 7bit/8bit encodings return the string unchanged (modulo conversions) + encoder_module = self._get_encoder(header_bytes) + if encoder_module is None: + return string + return encoder_module.header_encode(header_bytes, codec) + + def header_encode_lines(self, string, maxlengths): + """Header-encode a string by converting it first to bytes. + + This is similar to `header_encode()` except that the string is fit + into maximum line lengths as given by the arguments. + + :param string: A unicode string for the header. It must be possible + to encode this string to bytes using the character set's + output codec. + :param maxlengths: Maximum line length iterator. Each element + returned from this iterator will provide the next maximum line + length. This parameter is used as an argument to built-in next() + and should never be exhausted. The maximum line lengths should + not count the RFC 2047 chrome. These line lengths are only a + hint; the splitter does the best it can. + :param firstmaxlen: The maximum line length of the first line. If + None (the default), then `maxlen` is used for the first line. + :return: Lines of encoded strings, each with RFC 2047 chrome. + """ + # See which encoding we should use. + codec = self.output_codec or 'us-ascii' + header_bytes = string.encode(codec) + encoder_module = self._get_encoder(header_bytes) + encoder = partial(encoder_module.header_encode, charset=str(self)) + # Calculate the number of characters that the RFC 2047 chrome will + # contribute to each line. + charset = self.get_output_charset() + extra = len(charset) + RFC2047_CHROME_LEN + # Now comes the hard part. We must encode bytes but we can't split on + # bytes because some character sets are variable length and each + # encoded word must stand on its own. So the problem is you have to + # encode to bytes to figure out this word's length, but you must split + # on characters. This causes two problems: first, we don't know how + # many octets a specific substring of unicode characters will get + # encoded to, and second, we don't know how many ASCII characters + # those octets will get encoded to. Unless we try it. Which seems + # inefficient. In the interest of being correct rather than fast (and + # in the hope that there will be few encoded headers in any such + # message), brute force it. :( + lines = [] + current_line = [] + maxlen = next(maxlengths) - extra + for character in string: + current_line.append(character) + this_line = EMPTYSTRING.join(current_line) + length = encoder_module.header_length(this_line.encode(charset)) + if length > maxlen: + # This last character doesn't fit so pop it off. + current_line.pop() + # Does nothing fit on the first line? + if not lines and not current_line: + lines.append(None) + else: + separator = (' ' if lines else '') + joined_line = EMPTYSTRING.join(current_line) + header_bytes = joined_line.encode(codec) + lines.append(encoder(header_bytes)) + current_line = [character] + maxlen = next(maxlengths) - extra + joined_line = EMPTYSTRING.join(current_line) + header_bytes = joined_line.encode(codec) + lines.append(encoder(header_bytes)) + return lines + + def _get_encoder(self, header_bytes): if self.header_encoding == BASE64: - encoder = email.base64mime.header_encode + return email.base64mime elif self.header_encoding == QP: - encoder = email.quoprimime.header_encode + return email.quoprimime elif self.header_encoding == SHORTEST: - lenb64 = email.base64mime.base64_len(header_bytes) - lenqp = email.quoprimime.header_quopri_len(header_bytes) - if lenb64 < lenqp: - encoder = email.base64mime.header_encode + len64 = email.base64mime.header_length(header_bytes) + lenqp = email.quoprimime.header_length(header_bytes) + if len64 < lenqp: + return email.base64mime else: - encoder = email.quoprimime.header_encode + return email.quoprimime else: - return string - return encoder(header_bytes, codec) + return None - def body_encode(self, s, convert=True): - """Body-encode a string and convert it to output_charset. - - If convert is True (the default), the string will be converted from - the input charset to output charset automatically. Unlike - header_encode(), there are no issues with byte boundaries and - multibyte charsets in email bodies, so this is usually pretty safe. + def body_encode(self, string): + """Body-encode a string by converting it first to bytes. The type of encoding (base64 or quoted-printable) will be based on self.body_encoding. """ - if convert: - s = self.convert(s) # 7bit/8bit encodings return the string unchanged (module conversions) if self.body_encoding is BASE64: - return email.base64mime.body_encode(s) + return email.base64mime.body_encode(string) elif self.body_encoding is QP: - return email.quoprimime.body_encode(s) + return email.quoprimime.body_encode(string) else: - return s + return string |