Oops. I copied a slightly older version of the email package from the sandbox.

This should restore the email package in the py3k branch to exactly what's in the sandbox. This wipes out 1-2 fixes made post-copy, which I'll re-apply shortly.
author: Guido van Rossum <guido@python.org> 2007-08-30 03:46:43 (GMT)
committer: Guido van Rossum <guido@python.org> 2007-08-30 03:46:43 (GMT)
commit: 9604e66660bfe5066a88e3eb560a5846c620e8de (patch)
tree: 445e4bdae6ea20847bdfa014ebdab7a1b7eb2233 /Lib/email/charset.py
parent: 2c440a1086e182796a52eeca1fe7c2baa441591d (diff)
download: cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.zip
cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.tar.gz
cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.tar.bz2
1 files changed, 92 insertions, 97 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 1435ee5..9e5ee67 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -9,6 +9,8 @@ __all__ = [
     'add_codec',
     ]
 
+from functools import partial
+
 import email.base64mime
 import email.quoprimime
 
@@ -23,9 +25,10 @@ BASE64      = 2 # Base64
 SHORTEST    = 3 # the shorter of QP and base64, but only for headers
 
 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
-MISC_LEN = 7
+RFC2047_CHROME_LEN = 7
 
 DEFAULT_CHARSET = 'us-ascii'
+EMPTYSTRING = ''
 
 
 
@@ -259,63 +262,6 @@ class Charset:
         else:
             return encode_7or8bit
 
-    def convert(self, s):
-        """Convert a string from the input_codec to the output_codec."""
-        if self.input_codec != self.output_codec:
-            rawbytes = bytes(ord(c) for c in s)
-            decoded = rawbytes.decode(self.input_codec)
-            encoded = decoded.encode(self.output_codec)
-            return str(encoded)
-        else:
-            return s
-
-    def to_splittable(self, s):
-        """Convert a possibly multibyte string to a safely splittable format.
-
-        Uses the input_codec to try and convert the string to Unicode, so it
-        can be safely split on character boundaries (even for multibyte
-        characters).
-
-        Returns the string as-is if it isn't known how to convert it to
-        Unicode with the input_charset.
-
-        Characters that could not be converted to Unicode will be replaced
-        with the Unicode replacement character U+FFFD.
-        """
-        if isinstance(s, str) or self.input_codec is None:
-            return s
-        try:
-            return str(s, self.input_codec, 'replace')
-        except LookupError:
-            # Input codec not installed on system, so return the original
-            # string unchanged.
-            return s
-
-    def from_splittable(self, ustr, to_output=True):
-        """Convert a splittable string back into an encoded string.
-
-        Uses the proper codec to try and convert the string from Unicode back
-        into an encoded format.  Return the string as-is if it is not Unicode,
-        or if it could not be converted from Unicode.
-
-        Characters that could not be converted from Unicode will be replaced
-        with an appropriate character (usually '?').
-
-        If to_output is True (the default), uses output_codec to convert to an
-        encoded format.  If to_output is False, uses input_codec.
-        """
-        if to_output:
-            codec = self.output_codec
-        else:
-            codec = self.input_codec
-        if not isinstance(ustr, str) or codec is None:
-            return ustr
-        try:
-            return str(ustr.encode(codec, 'replace'))
-        except LookupError:
-            # Output codec not installed
-            return ustr
-
     def get_output_charset(self):
         """Return the output character set.
 
@@ -324,66 +270,115 @@ class Charset:
         """
         return self.output_charset or self.input_charset
 
-    def encoded_header_len(self, s):
-        """Return the length of the encoded header string."""
-        cset = self.get_output_charset()
-        # The len(s) of a 7bit encoding is len(s)
-        if self.header_encoding == BASE64:
-            return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
-        elif self.header_encoding == QP:
-            return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
-        elif self.header_encoding == SHORTEST:
-            lenb64 = email.base64mime.base64_len(s)
-            lenqp = email.quoprimime.header_quopri_len(s)
-            return min(lenb64, lenqp) + len(cset) + MISC_LEN
-        else:
-            return len(s)
-
     def header_encode(self, string):
         """Header-encode a string by converting it first to bytes.
 
-        :param string: A unicode string for the header.  This must be
-        encodable to bytes using the current character set's `output_codec`.
-
         The type of encoding (base64 or quoted-printable) will be based on
         this charset's `header_encoding`.
+
+        :param string: A unicode string for the header.  It must be possible
+            to encode this string to bytes using the character set's
+            output codec.
+        :return: The encoded string, with RFC 2047 chrome.
         """
         codec = self.output_codec or 'us-ascii'
         charset = self.get_output_charset()
         header_bytes = string.encode(codec)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
+        encoder_module = self._get_encoder(header_bytes)
+        if encoder_module is None:
+            return string
+        return encoder_module.header_encode(header_bytes, codec)
+
+    def header_encode_lines(self, string, maxlengths):
+        """Header-encode a string by converting it first to bytes.
+
+        This is similar to `header_encode()` except that the string is fit
+        into maximum line lengths as given by the arguments.
+
+        :param string: A unicode string for the header.  It must be possible
+            to encode this string to bytes using the character set's
+            output codec.
+        :param maxlengths: Maximum line length iterator.  Each element
+            returned from this iterator will provide the next maximum line
+            length.  This parameter is used as an argument to built-in next()
+            and should never be exhausted.  The maximum line lengths should
+            not count the RFC 2047 chrome.  These line lengths are only a
+            hint; the splitter does the best it can.
+            The first value it yields bounds the first line; each later
+            value bounds the next line in turn.
+        :return: Lines of encoded strings, each with RFC 2047 chrome.
+        """
+        # See which encoding we should use.
+        codec = self.output_codec or 'us-ascii'
+        header_bytes = string.encode(codec)
+        encoder_module = self._get_encoder(header_bytes)
+        encoder = partial(encoder_module.header_encode, charset=str(self))
+        # Calculate the number of characters that the RFC 2047 chrome will
+        # contribute to each line.
+        charset = self.get_output_charset()
+        extra = len(charset) + RFC2047_CHROME_LEN
+        # Now comes the hard part.  We must encode bytes but we can't split on
+        # bytes because some character sets are variable length and each
+        # encoded word must stand on its own.  So the problem is you have to
+        # encode to bytes to figure out this word's length, but you must split
+        # on characters.  This causes two problems: first, we don't know how
+        # many octets a specific substring of unicode characters will get
+        # encoded to, and second, we don't know how many ASCII characters
+        # those octets will get encoded to.  Unless we try it.  Which seems
+        # inefficient.  In the interest of being correct rather than fast (and
+        # in the hope that there will be few encoded headers in any such
+        # message), brute force it. :(
+        lines = []
+        current_line = []
+        maxlen = next(maxlengths) - extra
+        for character in string:
+            current_line.append(character)
+            this_line = EMPTYSTRING.join(current_line)
+            length = encoder_module.header_length(this_line.encode(charset))
+            if length > maxlen:
+                # This last character doesn't fit so pop it off.
+                current_line.pop()
+                # Does nothing fit on the first line?
+                if not lines and not current_line:
+                    lines.append(None)
+                else:
+                    separator = (' ' if lines else '')
+                    joined_line = EMPTYSTRING.join(current_line)
+                    header_bytes = joined_line.encode(codec)
+                    lines.append(encoder(header_bytes))
+                current_line = [character]
+                maxlen = next(maxlengths) - extra
+        joined_line = EMPTYSTRING.join(current_line)
+        header_bytes = joined_line.encode(codec)
+        lines.append(encoder(header_bytes))
+        return lines
+
+    def _get_encoder(self, header_bytes):
         if self.header_encoding == BASE64:
-            encoder = email.base64mime.header_encode
+            return email.base64mime
         elif self.header_encoding == QP:
-            encoder = email.quoprimime.header_encode
+            return email.quoprimime
         elif self.header_encoding == SHORTEST:
-            lenb64 = email.base64mime.base64_len(header_bytes)
-            lenqp = email.quoprimime.header_quopri_len(header_bytes)
-            if lenb64 < lenqp:
-                encoder = email.base64mime.header_encode
+            len64 = email.base64mime.header_length(header_bytes)
+            lenqp = email.quoprimime.header_length(header_bytes)
+            if len64 < lenqp:
+                return email.base64mime
             else:
-                encoder = email.quoprimime.header_encode
+                return email.quoprimime
         else:
-            return string
-        return encoder(header_bytes, codec)
+            return None
 
-    def body_encode(self, s, convert=True):
-        """Body-encode a string and convert it to output_charset.
-
-        If convert is True (the default), the string will be converted from
-        the input charset to output charset automatically.  Unlike
-        header_encode(), there are no issues with byte boundaries and
-        multibyte charsets in email bodies, so this is usually pretty safe.
+    def body_encode(self, string):
+        """Body-encode a string by converting it first to bytes.
 
         The type of encoding (base64 or quoted-printable) will be based on
         self.body_encoding.
         """
-        if convert:
-            s = self.convert(s)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
         if self.body_encoding is BASE64:
-            return email.base64mime.body_encode(s)
+            return email.base64mime.body_encode(string)
         elif self.body_encoding is QP:
-            return email.quoprimime.body_encode(s)
+            return email.quoprimime.body_encode(string)
         else:
-            return s
+            return string
author	Guido van Rossum <guido@python.org>	2007-08-30 03:46:43 (GMT)
committer	Guido van Rossum <guido@python.org>	2007-08-30 03:46:43 (GMT)
commit	9604e66660bfe5066a88e3eb560a5846c620e8de (patch)
tree	445e4bdae6ea20847bdfa014ebdab7a1b7eb2233 /Lib/email/charset.py
parent	2c440a1086e182796a52eeca1fe7c2baa441591d (diff)
download	cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.zip cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.tar.gz cpython-9604e66660bfe5066a88e3eb560a5846c620e8de.tar.bz2