Added a feature suggested by Martin v Loewis, where a new header encoding flag SHORTEST means to return the shortest encoding between base64 and qp. This is used for the header_enc for utf-8. SHORTEST isn't legal for body_enc.

Also some code cleanup:
- use True/False everywhere
- use == instead of `is' in a few places
- added _isunicode() and made the "is unicode" checks consistent
- update docstrings
author: Barry Warsaw <barry@python.org> 2002-09-28 17:47:56 (GMT)
committer: Barry Warsaw <barry@python.org> 2002-09-28 17:47:56 (GMT)
commit: 5932c9bedd959eb8a25b8d3ad40907142cbaa4f4 (patch)
tree: 052b728647b84c14b209a75b773c14f8bf0d918f /Lib/email/Charset.py
parent: 176916a98928c1828acd997f5bc11863f0d36c71 (diff)
download: cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.zip
cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.tar.gz
cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.tar.bz2
1 files changed, 55 insertions, 37 deletions
diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
index c0fe9d7..2050eb1 100644
--- a/Lib/email/Charset.py
+++ b/Lib/email/Charset.py
@@ -1,26 +1,27 @@
 # Copyright (C) 2001,2002 Python Software Foundation
-# Author: che@debian.org (Ben Gertzfield)
-
-try:
-    unicode
-except NameError:
-    def _is_unicode(x):
-        return 1==0
-else:
-    # Use UnicodeType instead of built-in unicode for Py2.1 compatibility
-    from types import UnicodeType
-    def _is_unicode(x):
-        return isinstance(x, UnicodeType)
+# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
 
+from types import UnicodeType
 from email.Encoders import encode_7or8bit
 import email.base64MIME
 import email.quopriMIME
 
+def _isunicode(s):
+    return isinstance(s, UnicodeType)
+
+# Python 2.2.1 and beyond has these symbols
+try:
+    True, False
+except NameError:
+    True = 1
+    False = 0
+
 
 
 # Flags for types of header encodings
-QP     = 1  # Quoted-Printable
-BASE64 = 2  # Base64
+QP     = 1   # Quoted-Printable
+BASE64 = 2   # Base64
+SHORTEST = 3 # the shorter of QP and base64, but only for headers
 
 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
 MISC_LEN = 7
@@ -41,7 +42,7 @@ CHARSETS = {
     'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
     'iso-2022-jp': (BASE64,    None,    None),
     'koi8-r':      (BASE64,    BASE64,  None),
-    'utf-8':       (BASE64,    BASE64,  'utf-8'),
+    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
     }
 
 # Aliases for other commonly-used names for character sets.  Map
@@ -90,9 +91,11 @@ def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
     character set.
 
     Optional header_enc and body_enc is either Charset.QP for
-    quoted-printable, Charset.BASE64 for base64 encoding, or None for no
-    encoding.  It describes how message headers and message bodies in the
-    input charset are to be encoded.  Default is no encoding.
+    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
+    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
+    is only valid for header_enc.  It describes how message headers and
+    message bodies in the input charset are to be encoded.  Default is no
+    encoding.
 
     Optional output_charset is the character set that the output should be
     in.  Conversions will proceed from input charset, to Unicode, to the
@@ -104,6 +107,8 @@ def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
     to add codecs the module does not know about.  See the codec module's
     documentation for more information.
     """
+    if body_enc == SHORTEST:
+        raise ValueError, 'SHORTEST not allowed for body_enc'
     CHARSETS[charset] = (header_enc, body_enc, output_charset)
 
 
@@ -147,12 +152,14 @@ class Charset:
 
     header_encoding: If the character set must be encoded before it can be
                      used in an email header, this attribute will be set to
-                     Charset.QP (for quoted-printable) or Charset.BASE64 (for
-                     base64 encoding).  Otherwise, it will be None.
+                     Charset.QP (for quoted-printable), Charset.BASE64 (for
+                     base64 encoding), or Charset.SHORTEST for the shortest of
+                     QP or BASE64 encoding.  Otherwise, it will be None.
 
     body_encoding: Same as header_encoding, but describes the encoding for the
                    mail message's body, which indeed may be different than the
-                   header encoding.
+                   header encoding.  Charset.SHORTEST is not allowed for
+                   body_encoding.
 
     output_charset: Some character sets must be converted before the can be
                     used in email headers or bodies.  If the input_charset is
@@ -175,7 +182,7 @@ class Charset:
         # charset_map dictionary.  Try that first, but let the user override
         # it.
         henc, benc, conv = CHARSETS.get(self.input_charset,
-                                        (BASE64, BASE64, None))
+                                        (SHORTEST, SHORTEST, None))
         # Set the attributes, allowing the arguments to override the default.
         self.header_encoding = henc
         self.body_encoding = benc
@@ -209,6 +216,7 @@ class Charset:
         Returns "base64" if self.body_encoding is BASE64.
         Returns "7bit" otherwise.
         """
+        assert self.body_encoding <> SHORTEST
         if self.body_encoding == QP:
             return 'quoted-printable'
         elif self.body_encoding == BASE64:
@@ -236,7 +244,7 @@ class Charset:
         Characters that could not be converted to Unicode will be replaced
         with the Unicode replacement character U+FFFD.
         """
-        if _is_unicode(s) or self.input_codec is None:
+        if _isunicode(s) or self.input_codec is None:
             return s
         try:
             return unicode(s, self.input_codec, 'replace')
@@ -245,7 +253,7 @@ class Charset:
             # string unchanged.
             return s
 
-    def from_splittable(self, ustr, to_output=1):
+    def from_splittable(self, ustr, to_output=True):
         """Convert a splittable string back into an encoded string.
 
         Uses the proper codec to try and convert the string from
@@ -256,15 +264,14 @@ class Charset:
         Characters that could not be converted from Unicode will be replaced
         with an appropriate character (usually '?').
 
-        If to_output is true, uses output_codec to convert to an encoded
-        format.  If to_output is false, uses input_codec.  to_output defaults
-        to 1.
+        If to_output is True (the default), uses output_codec to convert to an
+        encoded format.  If to_output is False, uses input_codec.
         """
         if to_output:
             codec = self.output_codec
         else:
             codec = self.input_codec
-        if not _is_unicode(ustr) or codec is None:
+        if not _isunicode(ustr) or codec is None:
             return ustr
         try:
             return ustr.encode(codec, 'replace')
@@ -284,22 +291,26 @@ class Charset:
         """Return the length of the encoded header string."""
         cset = self.get_output_charset()
         # The len(s) of a 7bit encoding is len(s)
-        if self.header_encoding is BASE64:
+        if self.header_encoding == BASE64:
             return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
-        elif self.header_encoding is QP:
+        elif self.header_encoding == QP:
             return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
+        elif self.header_encoding == SHORTEST:
+            lenb64 = email.base64MIME.base64_len(s)
+            lenqp = email.quopriMIME.header_quopri_len(s)
+            return min(lenb64, lenqp) + len(cset) + MISC_LEN
         else:
             return len(s)
 
-    def header_encode(self, s, convert=0):
+    def header_encode(self, s, convert=False):
         """Header-encode a string, optionally converting it to output_charset.
 
-        If convert is true, the string will be converted from the input
+        If convert is True, the string will be converted from the input
         charset to the output charset automatically.  This is not useful for
         multibyte character sets, which have line length issues (multibyte
         characters must be split on a character, not a byte boundary); use the
         high-level Header class to deal with these issues.  convert defaults
-        to 0.
+        to False.
 
         The type of encoding (base64 or quoted-printable) will be based on
         self.header_encoding.
@@ -308,17 +319,24 @@ class Charset:
         if convert:
             s = self.convert(s)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
-        if self.header_encoding is BASE64:
+        if self.header_encoding == BASE64:
             return email.base64MIME.header_encode(s, cset)
-        elif self.header_encoding is QP:
+        elif self.header_encoding == QP:
             return email.quopriMIME.header_encode(s, cset)
+        elif self.header_encoding == SHORTEST:
+            lenb64 = email.base64MIME.base64_len(s)
+            lenqp = email.quopriMIME.header_quopri_len(s)
+            if lenb64 < lenqp:
+                return email.base64MIME.header_encode(s, cset)
+            else:
+                return email.quopriMIME.header_encode(s, cset)
         else:
             return s
 
-    def body_encode(self, s, convert=1):
+    def body_encode(self, s, convert=True):
         """Body-encode a string and convert it to output_charset.
 
-        If convert is true (the default), the string will be converted from
+        If convert is True (the default), the string will be converted from
         the input charset to output charset automatically.  Unlike
         header_encode(), there are no issues with byte boundaries and
         multibyte charsets in email bodies, so this is usually pretty safe.
author	Barry Warsaw <barry@python.org>	2002-09-28 17:47:56 (GMT)
committer	Barry Warsaw <barry@python.org>	2002-09-28 17:47:56 (GMT)
commit	5932c9bedd959eb8a25b8d3ad40907142cbaa4f4 (patch)
tree	052b728647b84c14b209a75b773c14f8bf0d918f /Lib/email/Charset.py
parent	176916a98928c1828acd997f5bc11863f0d36c71 (diff)
download	cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.zip cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.tar.gz cpython-5932c9bedd959eb8a25b8d3ad40907142cbaa4f4.tar.bz2