Fixing some RFC 2231-related issues as reported in the Spambayes project, and with assistance from Oleg Broytmann. Specifically:

get_param(), get_params(): Document that these methods may return parameter values that are either strings, or 3-tuples in the case of RFC 2231-encoded parameters. The application should be prepared to deal with such return values.

get_boundary(): Be prepared to deal with RFC 2231-encoded boundary parameters. It makes little sense to have boundaries that are anything but ASCII, so if we get back a 3-tuple from get_param() we will decode it into ASCII and let any failures percolate up.

get_content_charset(): New method which treats the charset parameter just like the boundary parameter in get_boundary(). Note that "get_charset()" was already taken to return the default Charset object.

get_charsets(): Rewrite to use get_content_charset().
author: Barry Warsaw <barry@python.org> 2002-09-26 17:19:34 (GMT)
committer: Barry Warsaw <barry@python.org> 2002-09-26 17:19:34 (GMT)
commit: 15aefa94d065cbb7408484ff98406cffd5002e2b (patch)
tree: 6aab24ba6555383e55ed566d72459d5be5769955 /Lib/email
parent: 9b1a80baf407e5a0bee40e28357d35e64263233e (diff)
download: cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.zip
cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.tar.gz
cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.tar.bz2
1 files changed, 39 insertions, 9 deletions
diff --git a/Lib/email/Message.py b/Lib/email/Message.py
index c018ae7..8bc82a6 100644
--- a/Lib/email/Message.py
+++ b/Lib/email/Message.py
@@ -53,7 +53,7 @@ def _formatparam(param, value=None, quote=1):
 
 def _unquotevalue(value):
     if isinstance(value, TupleType):
-        return (value[0], value[1], Utils.unquote(value[2]))
+        return value[0], value[1], Utils.unquote(value[2])
     else:
         return Utils.unquote(value)
 
@@ -509,8 +509,8 @@ class Message:
         The elements of the returned list are 2-tuples of key/value pairs, as
         split on the `=' sign.  The left hand side of the `=' is the key,
         while the right hand side is the value.  If there is no `=' sign in
-        the parameter the value is the empty string.  The value is always
-        unquoted, unless unquote is set to a false value.
+        the parameter the value is the empty string.  The value is as
+        described in the get_param() method.
 
         Optional failobj is the object to return if there is no Content-Type:
         header.  Optional header is the header to search instead of
@@ -529,11 +529,23 @@ class Message:
         """Return the parameter value if found in the Content-Type: header.
 
         Optional failobj is the object to return if there is no Content-Type:
-        header.  Optional header is the header to search instead of
-        Content-Type:
-
-        Parameter keys are always compared case insensitively.  Values are
-        always unquoted, unless unquote is set to a false value.
+        header, or the Content-Type header has no such parameter.  Optional
+        header is the header to search instead of Content-Type:
+
+        Parameter keys are always compared case insensitively.  The return
+        value can either be a string, or a 3-tuple if the parameter was RFC
+        2231 encoded.  When it's a 3-tuple, the elements of the value are of
+        the form (CHARSET, LANGUAGE, VALUE), where LANGUAGE may be the empty
+        string.  Your application should be prepared to deal with these, and
+        can convert the parameter to a Unicode string like so:
+
+            param = msg.get_param('foo')
+            if isinstance(param, tuple):
+                param = unicode(param[2], param[0])
+
+        In any case, the parameter value (either the returned string, or the
+        VALUE item in the 3-tuple) is always unquoted, unless unquote is set
+        to a false value.
         """
         if not self.has_key(header):
             return failobj
@@ -674,6 +686,9 @@ class Message:
         boundary = self.get_param('boundary', missing)
         if boundary is missing:
             return failobj
+        if isinstance(boundary, TupleType):
+            # RFC 2231 encoded, so decode.  It better end up as ascii
+            return unicode(boundary[2], boundary[0]).encode('us-ascii')
         return _unquotevalue(boundary.strip())
 
     def set_boundary(self, boundary):
@@ -727,6 +742,21 @@ class Message:
         # Must be using Python 2.1
         from email._compat21 import walk
 
+    def get_content_charset(self, failobj=None):
+        """Return the charset parameter of the Content-Type header.
+
+        If there is no Content-Type header, or if that header has no charset
+        parameter, failobj is returned.
+        """
+        missing = []
+        charset = self.get_param('charset', missing)
+        if charset is missing:
+            return failobj
+        if isinstance(charset, TupleType):
+            # RFC 2231 encoded, so decode it, and it better end up as ascii.
+            return unicode(charset[2], charset[0]).encode('us-ascii')
+        return charset
+
     def get_charsets(self, failobj=None):
         """Return a list containing the charset(s) used in this message.
 
@@ -743,4 +773,4 @@ class Message:
         one for the container message (i.e. self), so that a non-multipart
         message will still return a list of length 1.
         """
-        return [part.get_param('charset', failobj) for part in self.walk()]
+        return [part.get_content_charset(failobj) for part in self.walk()]
author	Barry Warsaw <barry@python.org>	2002-09-26 17:19:34 (GMT)
committer	Barry Warsaw <barry@python.org>	2002-09-26 17:19:34 (GMT)
commit	15aefa94d065cbb7408484ff98406cffd5002e2b (patch)
tree	6aab24ba6555383e55ed566d72459d5be5769955 /Lib/email
parent	9b1a80baf407e5a0bee40e28357d35e64263233e (diff)
download	cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.zip cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.tar.gz cpython-15aefa94d065cbb7408484ff98406cffd5002e2b.tar.bz2