diff options
author | Barry Warsaw <barry@python.org> | 2002-04-10 21:01:31 (GMT) |
---|---|---|
committer | Barry Warsaw <barry@python.org> | 2002-04-10 21:01:31 (GMT) |
commit | 409a4c08b545aa064cf8fe3b8de51404756a301e (patch) | |
tree | 06cf8fe44e1fe28fbc0147635ec41961f2df6515 /Lib/email | |
parent | 68e69338ae19c37bd3e69cb76e107bfa76231e06 (diff) | |
download | cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.zip cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.gz cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.bz2 |
Sync'ing with standalone email package 2.0.1. This adds support for
non-us-ascii character sets in headers and bodies. Some API changes
(with DeprecationWarnings for the old APIs). Better RFC-compliant
implementations of base64 and quoted-printable.
Updated test cases. Documentation updates to follow (after I finish
writing them ;).
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/Charset.py | 327 | ||||
-rw-r--r-- | Lib/email/Encoders.py | 10 | ||||
-rw-r--r-- | Lib/email/Errors.py | 2 | ||||
-rw-r--r-- | Lib/email/Generator.py | 54 | ||||
-rw-r--r-- | Lib/email/Header.py | 210 | ||||
-rw-r--r-- | Lib/email/Iterators.py | 2 | ||||
-rw-r--r-- | Lib/email/MIMEBase.py | 2 | ||||
-rw-r--r-- | Lib/email/MIMEImage.py | 2 | ||||
-rw-r--r-- | Lib/email/MIMEMessage.py | 2 | ||||
-rw-r--r-- | Lib/email/MIMEText.py | 35 | ||||
-rw-r--r-- | Lib/email/Message.py | 223 | ||||
-rw-r--r-- | Lib/email/Parser.py | 26 | ||||
-rw-r--r-- | Lib/email/Utils.py | 143 | ||||
-rw-r--r-- | Lib/email/__init__.py | 10 | ||||
-rw-r--r-- | Lib/email/base64MIME.py | 174 | ||||
-rw-r--r-- | Lib/email/quopriMIME.py | 312 |
16 files changed, 1438 insertions, 96 deletions
diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py new file mode 100644 index 0000000..4874597 --- /dev/null +++ b/Lib/email/Charset.py @@ -0,0 +1,327 @@ +# Copyright (C) 2001,2002 Python Software Foundation +# Author: che@debian.org (Ben Gertzfield) + +from types import UnicodeType +from email.Encoders import encode_7or8bit +import email.base64MIME +import email.quopriMIME + + + +# Flags for types of header encodings +QP = 1 # Quoted-Printable +BASE64 = 2 # Base64 + +# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 +MISC_LEN = 7 + +DEFAULT_CHARSET = 'us-ascii' + + + +# Defaults +CHARSETS = { + # input header enc body enc output conv + 'iso-8859-1': (QP, QP, None), + 'iso-8859-2': (QP, QP, None), + 'us-ascii': (None, None, None), + 'big5': (BASE64, BASE64, None), + 'gb2312': (BASE64, BASE64, None), + 'euc-jp': (BASE64, None, 'iso-2022-jp'), + 'shift_jis': (BASE64, None, 'iso-2022-jp'), + 'iso-2022-jp': (BASE64, None, None), + 'koi8-r': (BASE64, BASE64, None), + 'utf-8': (BASE64, BASE64, 'utf-8'), + } + +# Aliases for other commonly-used names for character sets. Map +# them to the real ones used in email. +ALIASES = { + 'latin_1': 'iso-8859-1', + 'latin-1': 'iso-8859-1', + 'ascii': 'us-ascii', + } + +# Map charsets to their Unicode codec strings. Note that the Japanese +# examples included below do not (yet) come with Python! They are available +# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/ + +# The Chinese and Korean codecs are available from SourceForge: +# +# http://sourceforge.net/projects/python-codecs/ +# +# although you'll need to check them out of cvs since they haven't been file +# released yet. You might also try to use +# +# http://www.freshports.org/port-description.php3?port=6702 +# +# if you can get logged in. AFAICT, both the Chinese and Korean codecs are +# fairly experimental at this point. +CODEC_MAP = { + 'euc-jp': 'japanese.euc-jp', + 'iso-2022-jp': 'japanese.iso-2022-jp', + 'shift_jis': 'japanese.shift_jis', + 'gb2132': 'eucgb2312_cn', + 'big5': 'big5_tw', + 'utf-8': 'utf-8', + # Hack: We don't want *any* conversion for stuff marked us-ascii, as all + # sorts of garbage might be sent to us in the guise of 7-bit us-ascii. + # Let that stuff pass through without conversion to/from Unicode. + 'us-ascii': None, + } + + + +# Convenience functions for extending the above mappings +def add_charset(charset, header_enc=None, body_enc=None, output_charset=None): + """Add charset properties to the global map. + + charset is the input character set, and must be the canonical name of a + character set. + + Optional header_enc and body_enc is either Charset.QP for + quoted-printable, Charset.BASE64 for base64 encoding, or None for no + encoding. It describes how message headers and message bodies in the + input charset are to be encoded. Default is no encoding. + + Optional output_charset is the character set that the output should be + in. Conversions will proceed from input charset, to Unicode, to the + output charset when the method Charset.convert() is called. The default + is to output in the same character set as the input. + + Both input_charset and output_charset must have Unicode codec entries in + the module's charset-to-codec mapping; use add_codec(charset, codecname) + to add codecs the module does not know about. See the codec module's + documentation for more information. + """ + CHARSETS[charset] = (header_enc, body_enc, output_charset) + + +def add_alias(alias, canonical): + """Add a character set alias. + + alias is the alias name, e.g. latin-1 + canonical is the character set's canonical name, e.g. iso-8859-1 + """ + ALIASES[alias] = canonical + + +def add_codec(charset, codecname): + """Add a codec that map characters in the given charset to/from Unicode. + + charset is the canonical name of a character set. codecname is the name + of a Python codec, as appropriate for the second argument to the unicode() + built-in, or to the .encode() method of a Unicode string. + """ + CODEC_MAP[charset] = codecname + + + +class Charset: + """Map character sets to their email properties. + + This class provides information about the requirements imposed on email + for a specific character set. It also provides convenience routines for + converting between character sets, given the availability of the + applicable codecs. Given an character set, it will do its best to provide + information on how to use that character set in an email. + + Certain character sets must be encoded with quoted-printable or base64 + when used in email headers or bodies. Certain character sets must be + converted outright, and are not allowed in email. Instances of this + module expose the following information about a character set: + + input_charset: The initial character set specified. Common aliases + are converted to their `official' email names (e.g. latin_1 + is converted to iso-8859-1). Defaults to 7-bit us-ascii. + + header_encoding: If the character set must be encoded before it can be + used in an email header, this attribute will be set to + Charset.QP (for quoted-printable) or Charset.BASE64 (for + base64 encoding). Otherwise, it will be None. + + body_encoding: Same as header_encoding, but describes the encoding for the + mail message's body, which indeed may be different than the + header encoding. + + output_charset: Some character sets must be converted before the can be + used in email headers or bodies. If the input_charset is + one of them, this attribute will contain the name of the + charset output will be converted to. Otherwise, it will + be None. + + input_codec: The name of the Python codec used to convert the + input_charset to Unicode. If no conversion codec is + necessary, this attribute will be None. + + output_codec: The name of the Python codec used to convert Unicode + to the output_charset. If no conversion codec is necessary, + this attribute will have the same value as the input_codec. + """ + def __init__(self, input_charset=DEFAULT_CHARSET): + # Set the input charset after filtering through the aliases + self.input_charset = ALIASES.get(input_charset, input_charset) + # We can try to guess which encoding and conversion to use by the + # charset_map dictionary. Try that first, but let the user override + # it. + henc, benc, conv = CHARSETS.get(self.input_charset, + (BASE64, BASE64, None)) + # Set the attributes, allowing the arguments to override the default. + self.header_encoding = henc + self.body_encoding = benc + self.output_charset = ALIASES.get(conv, conv) + # Now set the codecs. If one isn't defined for input_charset, + # guess and try a Unicode codec with the same name as input_codec. + self.input_codec = CODEC_MAP.get(self.input_charset, + self.input_charset) + self.output_codec = CODEC_MAP.get(self.output_charset, + self.input_codec) + + def __str__(self): + return self.input_charset.lower() + + def __eq__(self, other): + return str(self) == str(other).lower() + + def __ne__(self, other): + return not self.__eq__(other) + + def get_body_encoding(self): + """Return the content-transfer-encoding used for body encoding. + + This is either the string `quoted-printable' or `base64' depending on + the encoding used, or it is a function in which case you should call + the function with a single argument, the Message object being + encoded. The function should then set the Content-Transfer-Encoding: + header itself to whatever is appropriate. + + Returns "quoted-printable" if self.body_encoding is QP. + Returns "base64" if self.body_encoding is BASE64. + Returns "7bit" otherwise. + """ + if self.body_encoding == QP: + return 'quoted-printable' + elif self.body_encoding == BASE64: + return 'base64' + else: + return encode_7or8bit + + def convert(self, s): + """Convert a string from the input_codec to the output_codec.""" + if self.input_codec <> self.output_codec: + return unicode(s, self.input_codec).encode(self.output_codec) + else: + return s + + def to_splittable(self, s): + """Convert a possibly multibyte string to a safely splittable format. + + Uses the input_codec to try and convert the string to Unicode, so it + can be safely split on character boundaries (even for double-byte + characters). + + Returns the string untouched if we don't know how to convert it to + Unicode with the input_charset. + + Characters that could not be converted to Unicode will be replaced + with the Unicode replacement character U+FFFD. + """ + if isinstance(s, UnicodeType) or self.input_codec is None: + return s + try: + return unicode(s, self.input_codec, 'replace') + except LookupError: + # Input codec not installed on system, so return the original + # string unchanged. + return s + + def from_splittable(self, ustr, to_output=1): + """Convert a splittable string back into an encoded string. + + Uses the proper codec to try and convert the string from + Unicode back into an encoded format. Return the string as-is + if it is not Unicode, or if it could not be encoded from + Unicode. + + Characters that could not be converted from Unicode will be replaced + with an appropriate character (usually '?'). + + If to_output is true, uses output_codec to convert to an encoded + format. If to_output is false, uses input_codec. to_output defaults + to 1. + """ + if to_output: + codec = self.output_codec + else: + codec = self.input_codec + if not isinstance(ustr, UnicodeType) or codec is None: + return ustr + try: + return ustr.encode(codec, 'replace') + except LookupError: + # Output codec not installed + return ustr + + def get_output_charset(self): + """Return the output character set. + + This is self.output_charset if that is set, otherwise it is + self.input_charset. + """ + return self.output_charset or self.input_charset + + def encoded_header_len(self, s): + """Return the length of the encoded header string.""" + cset = self.get_output_charset() + # The len(s) of a 7bit encoding is len(s) + if self.header_encoding is BASE64: + return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN + elif self.header_encoding is QP: + return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN + else: + return len(s) + + def header_encode(self, s, convert=0): + """Header-encode a string, optionally converting it to output_charset. + + If convert is true, the string will be converted from the input + charset to the output charset automatically. This is not useful for + multibyte character sets, which have line length issues (multibyte + characters must be split on a character, not a byte boundary); use the + high-level Header class to deal with these issues. convert defaults + to 0. + + The type of encoding (base64 or quoted-printable) will be based on + self.header_encoding. + """ + cset = self.get_output_charset() + if convert: + s = self.convert(s) + # 7bit/8bit encodings return the string unchanged (modulo conversions) + if self.header_encoding is BASE64: + return email.base64MIME.header_encode(s, cset) + elif self.header_encoding is QP: + return email.quopriMIME.header_encode(s, cset) + else: + return s + + def body_encode(self, s, convert=1): + """Body-encode a string and convert it to output_charset. + + If convert is true (the default), the string will be converted from + the input charset to output charset automatically. Unlike + header_encode(), there are no issues with byte boundaries and + multibyte charsets in email bodies, so this is usually pretty safe. + + The type of encoding (base64 or quoted-printable) will be based on + self.body_encoding. + """ + if convert: + s = self.convert(s) + # 7bit/8bit encodings return the string unchanged (module conversions) + if self.body_encoding is BASE64: + return email.base64MIME.body_encode(s) + elif self.header_encoding is QP: + return email.quopriMIME.body_encode(s) + else: + return s diff --git a/Lib/email/Encoders.py b/Lib/email/Encoders.py index d9cd42d..f09affa 100644 --- a/Lib/email/Encoders.py +++ b/Lib/email/Encoders.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Module containing encoding functions for Image.Image and Text.Text. @@ -11,7 +11,9 @@ from quopri import encodestring as _encodestring # Helpers def _qencode(s): - return _encodestring(s, quotetabs=1) + enc = _encodestring(s, quotetabs=1) + # Must encode spaces, which quopri.encodestring() doesn't do + return enc.replace(' ', '=20') def _bencode(s): @@ -54,6 +56,10 @@ def encode_quopri(msg): def encode_7or8bit(msg): """Set the Content-Transfer-Encoding: header to 7bit or 8bit.""" orig = msg.get_payload() + if orig is None: + # There's no payload. For backwards compatibility we use 7bit + msg['Content-Transfer-Encoding'] = '7bit' + return # We play a trick to make this go fast. If encoding to ASCII succeeds, we # know the data must be 7bit, otherwise treat it as 8bit. try: diff --git a/Lib/email/Errors.py b/Lib/email/Errors.py index 71d7663..e3a3666 100644 --- a/Lib/email/Errors.py +++ b/Lib/email/Errors.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """email package exception classes. diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py index 981e0ff..dbbcabc 100644 --- a/Lib/email/Generator.py +++ b/Lib/email/Generator.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Classes to generate plain text from a message object tree. @@ -166,30 +166,33 @@ class Generator: return text rtn = [] for line in text.split('\n'): + splitline = [] # Short lines can remain unchanged if len(line.replace('\t', SPACE8)) <= maxheaderlen: - rtn.append(line) - SEMINLTAB.join(rtn) + splitline.append(line) + rtn.append(SEMINLTAB.join(splitline)) else: - oldlen = len(text) + oldlen = len(line) # Try to break the line on semicolons, but if that doesn't # work, try to split on folding whitespace. - while len(text) > maxheaderlen: - i = text.rfind(';', 0, maxheaderlen) + while len(line) > maxheaderlen: + i = line.rfind(';', 0, maxheaderlen) if i < 0: break - rtn.append(text[:i]) - text = text[i+1:].lstrip() - if len(text) <> oldlen: + splitline.append(line[:i]) + line = line[i+1:].lstrip() + if len(line) <> oldlen: # Splitting on semis worked - rtn.append(text) - return SEMINLTAB.join(rtn) + splitline.append(line) + rtn.append(SEMINLTAB.join(splitline)) + continue # Splitting on semis didn't help, so try to split on # whitespace. - parts = re.split(r'(\s+)', text) + parts = re.split(r'(\s+)', line) # Watch out though for "Header: longnonsplittableline" if parts[0].endswith(':') and len(parts) == 3: - return text + rtn.append(line) + continue first = parts.pop(0) sublines = [first] acc = len(first) @@ -203,13 +206,14 @@ class Generator: else: # Split it here, but don't forget to ignore the # next whitespace-only part - rtn.append(EMPTYSTRING.join(sublines)) + splitline.append(EMPTYSTRING.join(sublines)) del parts[0] first = parts.pop(0) sublines = [first] acc = len(first) - rtn.append(EMPTYSTRING.join(sublines)) - return NLTAB.join(rtn) + splitline.append(EMPTYSTRING.join(sublines)) + rtn.append(NLTAB.join(splitline)) + return NL.join(rtn) # # Handlers for writing types and subtypes @@ -219,6 +223,9 @@ class Generator: payload = msg.get_payload() if payload is None: return + cset = msg.get_charset() + if cset is not None: + payload = cset.body_encode(payload) if not isinstance(payload, StringType): raise TypeError, 'string payload expected: %s' % type(payload) if self._mangle_from_: @@ -233,7 +240,18 @@ class Generator: # together, and then make sure that the boundary we've chosen isn't # present in the payload. msgtexts = [] - for part in msg.get_payload(): + subparts = msg.get_payload() + if subparts is None: + # Nothing has every been attached + boundary = msg.get_boundary(failobj=_make_boundary()) + print >> self._fp, '--' + boundary + print >> self._fp, '\n' + print >> self._fp, '--' + boundary + '--' + return + elif not isinstance(subparts, ListType): + # Scalar payload + subparts = [subparts] + for part in subparts: s = StringIO() g = self.__class__(s, self._mangle_from_, self.__maxheaderlen) g(part, unixfrom=0) @@ -365,7 +383,7 @@ class DecodedGenerator(Generator): # Helper -def _make_boundary(self, text=None): +def _make_boundary(text=None): # Craft a random boundary. If text is given, ensure that the chosen # boundary doesn't appear in the text. boundary = ('=' * 15) + repr(random.random()).split('.')[1] + '==' diff --git a/Lib/email/Header.py b/Lib/email/Header.py new file mode 100644 index 0000000..097b978 --- /dev/null +++ b/Lib/email/Header.py @@ -0,0 +1,210 @@ +# Copyright (C) 2002 Python Software Foundation +# Author: che@debian.org (Ben Gertzfield) + +"""Header encoding and decoding functionality.""" + +import re +import email.quopriMIME +import email.base64MIME +from email.Charset import Charset + +CRLFSPACE = '\r\n ' +CRLF = '\r\n' +NLSPACE = '\n ' + +MAXLINELEN = 76 + +ENCODE = 1 +DECODE = 2 + +# Match encoded-word strings in the form =?charset?q?Hello_World?= +ecre = re.compile(r''' + =\? # literal =? + (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset + \? # literal ? + (?P<encoding>[qb]) # either a "q" or a "b", case insensitive + \? # literal ? + (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string + \?= # literal ?= + ''', re.VERBOSE | re.IGNORECASE) + + + +# Helpers +_max_append = email.quopriMIME._max_append + + + +def decode_header(header): + """Decode a message header value without converting charset. + + Returns a list of (decoded_string, charset) pairs containing each of the + decoded parts of the header. Charset is None for non-encoded parts of the + header, otherwise a lower-case string containing the name of the character + set specified in the encoded string. + """ + # If no encoding, just return the header + header = str(header) + if not ecre.search(header): + return [(header, None)] + + decoded = [] + dec = '' + for line in header.splitlines(): + # This line might not have an encoding in it + if not ecre.search(line): + decoded.append((line, None)) + continue + + parts = ecre.split(line) + while parts: + unenc = parts.pop(0).strip() + if unenc: + # Should we continue a long line? + if decoded and decoded[-1][1] is None: + decoded[-1] = (decoded[-1][0] + dec, None) + else: + decoded.append((unenc, None)) + if parts: + charset, encoding = [s.lower() for s in parts[0:2]] + encoded = parts[2] + dec = '' + if encoding == 'q': + dec = email.quopriMIME.header_decode(encoded) + elif encoding == 'b': + dec = email.base64MIME.decode(encoded) + else: + dec = encoded + + if decoded and decoded[-1][1] == charset: + decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1]) + else: + decoded.append((dec, charset)) + del parts[0:3] + return decoded + + + +class Header: + def __init__(self, s, charset=None, maxlinelen=MAXLINELEN, + header_name=None): + """Create a MIME-compliant header that can contain many languages. + + Specify the initial header value in s. Specify its character set as a + Charset object in the charset argument. If none, a default Charset + instance will be used. + + You can later append to the header with append(s, charset) below; + charset does not have to be the same as the one initially specified + here. In fact, it's optional, and if not given, defaults to the + charset specified in the constructor. + + The maximum line length can either be specified by maxlinelen, or you + can pass in the name of the header field (e.g. "Subject") to let this + class guess the best line length to use to prevent wrapping. The + default maxlinelen is 76. + """ + if charset is None: + charset = Charset() + self._charset = charset + # BAW: I believe `chunks' and `maxlinelen' should be non-public. + self._chunks = [] + self.append(s, charset) + self._maxlinelen = maxlinelen + if header_name is not None: + self.guess_maxlinelen(header_name) + + def __str__(self): + """A synonym for self.encode().""" + return self.encode() + + def guess_maxlinelen(self, s=None): + """Guess the maximum length to make each header line. + + Given a header name (e.g. "Subject"), set this header's maximum line + length to an appropriate length to avoid line wrapping. If s is not + given, return the previous maximum line length and don't set it. + + Returns the new maximum line length. + """ + # BAW: is this semantic necessary? + if s is not None: + self._maxlinelen = MAXLINELEN - len(s) - 2 + return self._maxlinelen + + def append(self, s, charset=None): + """Append string s with Charset charset to the MIME header. + + charset defaults to the one given in the class constructor. + """ + if charset is None: + charset = self._charset + self._chunks.append((s, charset)) + + def _split(self, s, charset): + # Split up a header safely for use with encode_chunks. BAW: this + # appears to be a private convenience method. + splittable = charset.to_splittable(s) + encoded = charset.from_splittable(splittable) + + if charset.encoded_header_len(encoded) < self._maxlinelen: + return [(encoded, charset)] + else: + # Divide and conquer. BAW: halfway depends on integer division. + # When porting to Python 2.2, use the // operator. + halfway = len(splittable) // 2 + first = charset.from_splittable(splittable[:halfway], 0) + last = charset.from_splittable(splittable[halfway:], 0) + return self._split(first, charset) + self._split(last, charset) + + def encode(self): + """Encode a message header, possibly converting charset and encoding. + + There are many issues involved in converting a given string for use in + an email header. Only certain character sets are readable in most + email clients, and as header strings can only contain a subset of + 7-bit ASCII, care must be taken to properly convert and encode (with + Base64 or quoted-printable) header strings. In addition, there is a + 75-character length limit on any given encoded header field, so + line-wrapping must be performed, even with double-byte character sets. + + This method will do its best to convert the string to the correct + character set used in email, and encode and line wrap it safely with + the appropriate scheme for that character set. + + If the given charset is not known or an error occurs during + conversion, this function will return the header untouched. + """ + newchunks = [] + for s, charset in self._chunks: + newchunks += self._split(s, charset) + self._chunks = newchunks + return self.encode_chunks() + + def encode_chunks(self): + """MIME-encode a header with many different charsets and/or encodings. + + Given a list of pairs (string, charset), return a MIME-encoded string + suitable for use in a header field. Each pair may have different + charsets and/or encodings, and the resulting header will accurately + reflect each setting. + + Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like + character sets like iso-8859-1), email.Utils.BASE64 (Base64, for + non-ASCII like character sets like KOI8-R and iso-2022-jp), or None + (no encoding). + + Each pair will be represented on a separate line; the resulting string + will be in the format: + + "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n + =?charset2?b?SvxyZ2VuIEL2aW5n?=" + """ + chunks = [] + for header, charset in self._chunks: + if charset is None: + _max_append(chunks, header, self._maxlinelen, ' ') + else: + _max_append(chunks, charset.header_encode(header, 0), + self._maxlinelen, ' ') + return NLSPACE.join(chunks) diff --git a/Lib/email/Iterators.py b/Lib/email/Iterators.py index a64495d..515bac9 100644 --- a/Lib/email/Iterators.py +++ b/Lib/email/Iterators.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Various types of useful iterators and generators. diff --git a/Lib/email/MIMEBase.py b/Lib/email/MIMEBase.py index 33216f6..28816e8 100644 --- a/Lib/email/MIMEBase.py +++ b/Lib/email/MIMEBase.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Base class for MIME specializations. diff --git a/Lib/email/MIMEImage.py b/Lib/email/MIMEImage.py index 963da23..f0e7931a 100644 --- a/Lib/email/MIMEImage.py +++ b/Lib/email/MIMEImage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Class representing image/* type MIME documents. diff --git a/Lib/email/MIMEMessage.py b/Lib/email/MIMEMessage.py index fc4b2c6..89da925 100644 --- a/Lib/email/MIMEMessage.py +++ b/Lib/email/MIMEMessage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Class representing message/* MIME documents. diff --git a/Lib/email/MIMEText.py b/Lib/email/MIMEText.py index ccce9fb..8669d28 100644 --- a/Lib/email/MIMEText.py +++ b/Lib/email/MIMEText.py @@ -1,9 +1,10 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Class representing text/* type MIME documents. """ +import warnings import MIMEBase from Encoders import encode_7or8bit @@ -13,7 +14,7 @@ class MIMEText(MIMEBase.MIMEBase): """Class for generating text/* type MIME documents.""" def __init__(self, _text, _subtype='plain', _charset='us-ascii', - _encoder=encode_7or8bit): + _encoder=None): """Create a text/* type MIME document. _text is the string for this message object. If the text does not end @@ -22,20 +23,26 @@ class MIMEText(MIMEBase.MIMEBase): _subtype is the MIME sub content type, defaulting to "plain". _charset is the character set parameter added to the Content-Type: - header. This defaults to "us-ascii". - - _encoder is a function which will perform the actual encoding for - transport of the text data. It takes one argument, which is this - Text instance. It should use get_payload() and set_payload() to - change the payload to the encoded form. It should also add any - Content-Transfer-Encoding: or other headers to the message as - necessary. The default encoding doesn't actually modify the payload, - but it does set Content-Transfer-Encoding: to either `7bit' or `8bit' - as appropriate. + header. This defaults to "us-ascii". Note that as a side-effect, the + Content-Transfer-Encoding: header will also be set. + + The use of the _encoder is deprecated. The encoding of the payload, + and the setting of the character set parameter now happens implicitly + based on the _charset argument. If _encoder is supplied, then a + DeprecationWarning is used, and the _encoder functionality may + override any header settings indicated by _charset. This is probably + not what you want. """ MIMEBase.MIMEBase.__init__(self, 'text', _subtype, **{'charset': _charset}) if _text and _text[-1] <> '\n': _text += '\n' - self.set_payload(_text) - _encoder(self) + self.set_payload(_text, _charset) + if _encoder is not None: + warnings.warn('_encoder argument is obsolete.', + DeprecationWarning, 2) + # Because set_payload() with a _charset will set its own + # Content-Transfer-Encoding: header, we need to delete the + # existing one or will end up with two of them. :( + del self['content-transfer-encoding'] + _encoder(self) diff --git a/Lib/email/Message.py b/Lib/email/Message.py index 91931a1..71d10c4 100644 --- a/Lib/email/Message.py +++ b/Lib/email/Message.py @@ -1,23 +1,47 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Basic message object for the email package object model. """ -from __future__ import generators - import re -import base64 -import quopri +import warnings from cStringIO import StringIO -from types import ListType +from types import ListType, StringType # Intrapackage imports import Errors import Utils +import Charset SEMISPACE = '; ' + +# Regular expression used to split header parameters. BAW: this may be too +# simple. It isn't strictly RFC 2045 (section 5.1) compliant, but it catches +# most headers found in the wild. We may eventually need a full fledged +# parser eventually. paramre = re.compile(r'\s*;\s*') +# Regular expression that matches `special' characters in parameters, the +# existance of which force quoting of the parameter value. +tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') + + + +# Helper function +def _formatparam(param, value=None, quote=1): + """Convenience function to format and return a key=value pair. + + Will quote the value if needed or if quote is true. + """ + if value is not None and len(value) > 0: + # BAW: Please check this. I think that if quote is set it should + # force quoting even if not necessary. + if quote or tspecials.search(value): + return '%s="%s"' % (param, Utils.quote(value)) + else: + return '%s=%s' % (param, value) + else: + return param @@ -39,6 +63,7 @@ class Message: self._headers = [] self._unixfrom = None self._payload = None + self._charset = None # Defaults for multipart messages self.preamble = self.epilogue = None @@ -83,6 +108,8 @@ class Message: If the current payload is empty, then the current payload will be made a scalar, set to the given value. """ + warnings.warn('add_payload() is deprecated, use attach() instead.', + DeprecationWarning, 2) if self._payload is None: self._payload = payload elif type(self._payload) is ListType: @@ -93,8 +120,18 @@ class Message: else: self._payload = [self._payload, payload] - # A useful synonym - attach = add_payload + def attach(self, payload): + """Add the given payload to the current payload. + + The current payload will always be a list of objects after this method + is called. If you want to set the payload to a scalar object + (e.g. because you're attaching a message/rfc822 subpart), use + set_payload() instead. + """ + if self._payload is None: + self._payload = [payload] + else: + self._payload.append(payload) def get_payload(self, i=None, decode=0): """Return the current payload exactly as is. @@ -128,10 +165,58 @@ class Message: return payload - def set_payload(self, payload): - """Set the payload to the given value.""" + def set_payload(self, payload, charset=None): + """Set the payload to the given value. + + Optionally set the charset, which must be a Charset instance.""" self._payload = payload + if charset is not None: + self.set_charset(charset) + + def set_charset(self, charset): + """Set the charset of the payload to a given character set. + + charset can be a string or a Charset object. If it is a string, it + will be converted to a Charset object by calling Charset's + constructor. If charset is None, the charset parameter will be + removed from the Content-Type: field. Anything else will generate a + TypeError. + + The message will be assumed to be a text message encoded with + charset.input_charset. It will be converted to charset.output_charset + and encoded properly, if needed, when generating the plain text + representation of the message. MIME headers (MIME-Version, + Content-Type, Content-Transfer-Encoding) will be added as needed. + """ + if charset is None: + self.del_param('charset') + self._charset = None + return + if isinstance(charset, StringType): + charset = Charset.Charset(charset) + if not isinstance(charset, Charset.Charset): + raise TypeError, charset + # BAW: should we accept strings that can serve as arguments to the + # Charset constructor? + self._charset = charset + if not self.has_key('MIME-Version'): + self.add_header('MIME-Version', '1.0') + if not self.has_key('Content-Type'): + self.add_header('Content-Type', 'text/plain', + charset=charset.get_output_charset()) + else: + self.set_param('charset', charset.get_output_charset()) + if not self.has_key('Content-Transfer-Encoding'): + cte = charset.get_body_encoding() + if callable(cte): + cte(self) + else: + self.add_header('Content-Transfer-Encoding', cte) + def get_charset(self): + """Return the Charset object associated with the message's payload.""" + return self._charset + # # MAPPING INTERFACE (partial) # @@ -257,7 +342,7 @@ class Message: if v is None: parts.append(k.replace('_', '-')) else: - parts.append('%s="%s"' % (k.replace('_', '-'), v)) + parts.append(_formatparam(k.replace('_', '-'), v)) if _value is not None: parts.insert(0, _value) self._headers.append((_name, SEMISPACE.join(parts))) @@ -308,6 +393,8 @@ class Message: for p in paramre.split(value): try: name, val = p.split('=', 1) + name = name.rstrip() + val = val.lstrip() except ValueError: # Must have been a bare attribute name = p @@ -315,26 +402,29 @@ class Message: params.append((name, val)) return params - def get_params(self, failobj=None, header='content-type'): + def get_params(self, failobj=None, header='content-type', unquote=1): """Return the message's Content-Type: parameters, as a list. The elements of the returned list are 2-tuples of key/value pairs, as split on the `=' sign. The left hand side of the `=' is the key, while the right hand side is the value. If there is no `=' sign in the parameter the value is the empty string. The value is always - unquoted. + unquoted, unless unquote is set to a false value. Optional failobj is the object to return if there is no Content-Type: header. Optional header is the header to search instead of - Content-Type: + Content-Type:. """ missing = [] params = self._get_params_preserve(missing, header) if params is missing: return failobj - return [(k, Utils.unquote(v)) for k, v in params] + if unquote: + return [(k, Utils.unquote(v)) for k, v in params] + else: + return params - def get_param(self, param, failobj=None, header='content-type'): + def get_param(self, param, failobj=None, header='content-type', unquote=1): """Return the parameter value if found in the Content-Type: header. Optional failobj is the object to return if there is no Content-Type: @@ -342,15 +432,112 @@ class Message: Content-Type: Parameter keys are always compared case insensitively. Values are - always unquoted. + always unquoted, unless unquote is set to a false value. """ if not self.has_key(header): return failobj for k, v in self._get_params_preserve(failobj, header): if k.lower() == param.lower(): - return Utils.unquote(v) + if unquote: + return Utils.unquote(v) + else: + return v return failobj + def set_param(self, param, value, header='Content-Type', requote=1): + """Set a parameter in the Content-Type: header. + + If the parameter already exists in the header, its value will be + replaced with the new value. + + If header is Content-Type: and has not yet been defined in this + message, it will be set to "text/plain" and the new parameter and + value will be appended, as per RFC 2045. + + An alternate header can specified in the header argument, and + all parameters will be quoted as appropriate unless requote is + set to a false value. + """ + if not self.has_key(header) and header.lower() == 'content-type': + ctype = 'text/plain' + else: + ctype = self.get(header) + if not self.get_param(param, header=header): + if not ctype: + ctype = _formatparam(param, value, requote) + else: + ctype = SEMISPACE.join( + [ctype, _formatparam(param, value, requote)]) + else: + ctype = '' + for old_param, old_value in self.get_params(header=header, + unquote=requote): + append_param = '' + if old_param.lower() == param.lower(): + append_param = _formatparam(param, value, requote) + else: + append_param = _formatparam(old_param, old_value, requote) + if not ctype: + ctype = append_param + else: + ctype = SEMISPACE.join([ctype, append_param]) + if ctype <> self.get(header): + del self[header] + self[header] = ctype + + def del_param(self, param, header='content-type', requote=1): + """Remove the given parameter completely from the Content-Type header. + + The header will be re-written in place without param or its value. + All values will be quoted as appropriate unless requote is set to a + false value. + """ + if not self.has_key(header): + return + new_ctype = '' + for p, v in self.get_params(header, unquote=requote): + if p.lower() <> param.lower(): + if not new_ctype: + new_ctype = _formatparam(p, v, requote) + else: + new_ctype = SEMISPACE.join([new_ctype, + _formatparam(p, v, requote)]) + if new_ctype <> self.get(header): + del self[header] + self[header] = new_ctype + + def set_type(self, type, header='Content-Type', requote=1): + """Set the main type and subtype for the Content-Type: header. + + type must be a string in the form "maintype/subtype", otherwise a + ValueError is raised. + + This method replaces the Content-Type: header, keeping all the + parameters in place. If requote is false, this leaves the existing + header's quoting as is. Otherwise, the parameters will be quoted (the + default). + + An alternate header can be specified in the header argument. When the + Content-Type: header is set, we'll always also add a MIME-Version: + header. + """ + # BAW: should we be strict? + if not type.count('/') == 1: + raise ValueError + # Set the Content-Type: you get a MIME-Version: + if header.lower() == 'content-type': + del self['mime-version'] + self['MIME-Version'] = '1.0' + if not self.has_key(header): + self[header] = type + return + params = self.get_params(header, unquote=requote) + del self[header] + self[header] = type + # Skip the first param; it's the old type. + for p, v in params[1:]: + self.set_param(p, v, header, requote) + def get_filename(self, failobj=None): """Return the filename associated with the payload if present. diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py index 2f131d6..7177dfc 100644 --- a/Lib/email/Parser.py +++ b/Lib/email/Parser.py @@ -51,9 +51,16 @@ class Parser: lastvalue = [] lineno = 0 while 1: - line = fp.readline()[:-1] - if not line or not line.strip(): + # Don't strip the line before we test for the end condition, + # because whitespace-only header lines are RFC compliant + # continuation lines. + line = fp.readline() + if not line: break + line = line.splitlines()[0] + if not line: + break + # Ignore the trailing newline lineno += 1 # Check for initial Unix From_ line if line.startswith('From '): @@ -63,7 +70,6 @@ class Parser: else: raise Errors.HeaderParseError( 'Unix-from in headers after first rfc822 header') - # # Header continuation line if line[0] in ' \t': if not lastheader: @@ -134,11 +140,11 @@ class Parser: msgobj = self.parsestr(part) container.preamble = preamble container.epilogue = epilogue - # Ensure that the container's payload is a list - if not isinstance(container.get_payload(), ListType): - container.set_payload([msgobj]) - else: - container.add_payload(msgobj) + container.attach(msgobj) + elif container.get_main_type() == 'multipart': + # Very bad. A message is a multipart with no boundary! + raise Errors.BoundaryError( + 'multipart message with no defined boundary') elif container.get_type() == 'message/delivery-status': # This special kind of type contains blocks of headers separated # by a blank line. We'll represent each header block as a @@ -160,9 +166,9 @@ class Parser: except Errors.HeaderParseError: msg = self._class() self._parsebody(msg, fp) - container.add_payload(msg) + container.set_payload(msg) else: - container.add_payload(fp.read()) + container.set_payload(fp.read()) diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py index 3d48287..887be55 100644 --- a/Lib/email/Utils.py +++ b/Lib/email/Utils.py @@ -1,16 +1,26 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """Miscellaneous utilities. """ import time +import socket import re +import random +import os +import warnings +from cStringIO import StringIO +from types import ListType -from rfc822 import unquote, quote, parseaddr -from rfc822 import dump_address_pair +from rfc822 import unquote, quote from rfc822 import AddrlistClass as _AddrlistClass -from rfc822 import parsedate_tz, parsedate, mktime_tz +from rfc822 import mktime_tz + +# We need wormarounds for bugs in these methods in older Pythons (see below) +from rfc822 import parsedate as _parsedate +from rfc822 import parsedate_tz as _parsedate_tz +from rfc822 import parseaddr as _parseaddr from quopri import decodestring as _qdecode import base64 @@ -20,6 +30,10 @@ from Encoders import _bencode, _qencode COMMASPACE = ', ' UEMPTYSTRING = u'' +CRLF = '\r\n' + +specialsre = re.compile(r'[][\()<>@,:;".]') +escapesre = re.compile(r'[][\()"]') @@ -44,6 +58,41 @@ def _bdecode(s): +def fix_eols(s): + """Replace all line-ending characters with \r\n.""" + # Fix newlines with no preceding carriage return + s = re.sub(r'(?<!\r)\n', CRLF, s) + # Fix carriage returns with no following newline + s = re.sub(r'\r(?!\n)', CRLF, s) + return s + + + +def formataddr(pair): + """The inverse of parseaddr(), this takes a 2-tuple of the form + (realname, email_address) and returns the string value suitable + for an RFC 2822 From:, To: or Cc:. + + If the first element of pair is false, then the second element is + returned unmodified. + """ + name, address = pair + if name: + quotes = '' + if specialsre.search(name): + quotes = '"' + name = escapesre.sub(r'\\\g<0>', name) + return '%s%s%s <%s>' % (quotes, name, quotes, address) + return address + +# For backwards compatibility +def dump_address_pair(pair): + warnings.warn('Use email.Utils.formataddr() instead', + DeprecationWarning, 2) + return formataddr(pair) + + + def getaddresses(fieldvalues): """Return a list of (REALNAME, EMAIL) for each fieldvalue.""" all = COMMASPACE.join(fieldvalues) @@ -64,30 +113,26 @@ ecre = re.compile(r''' def decode(s): - """Return a decoded string according to RFC 2047, as a unicode string.""" + """Return a decoded string according to RFC 2047, as a unicode string. + + NOTE: This function is deprecated. Use Header.decode_header() instead. + """ + warnings.warn('Use Header.decode_header() instead.', DeprecationWarning, 2) + # Intra-package import here to avoid circular import problems. + from Header import decode_header + L = decode_header(s) + if not isinstance(L, ListType): + # s wasn't decoded + return s + rtn = [] - parts = ecre.split(s, 1) - while parts: - # If there are less than 4 parts, it can't be encoded and we're done - if len(parts) < 5: - rtn.extend(parts) - break - # The first element is any non-encoded leading text - rtn.append(parts[0]) - charset = parts[1] - encoding = parts[2].lower() - atom = parts[3] - # The next chunk to decode should be in parts[4] - parts = ecre.split(parts[4]) - # The encoding must be either `q' or `b', case-insensitive - if encoding == 'q': - func = _qdecode - elif encoding == 'b': - func = _bdecode + for atom, charset in L: + if charset is None: + rtn.append(atom) else: - func = _identity - # Decode and get the unicode in the charset - rtn.append(unicode(func(atom), charset)) + # Convert the string to Unicode using the given encoding. Leave + # Unicode conversion errors to strict. + rtn.append(unicode(atom, charset)) # Now that we've decoded everything, we just need to join all the parts # together into the final string. return UEMPTYSTRING.join(rtn) @@ -96,6 +141,7 @@ def decode(s): def encode(s, charset='iso-8859-1', encoding='q'): """Encode a string according to RFC 2047.""" + warnings.warn('Use Header.Header.encode() instead.', DeprecationWarning, 2) encoding = encoding.lower() if encoding == 'q': estr = _qencode(s) @@ -150,3 +196,48 @@ def formatdate(timeval=None, localtime=0): 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1], now[0], now[3], now[4], now[5], zone) + + + +def make_msgid(idstring=None): + """Returns a string suitable for RFC 2822 compliant Message-ID:, e.g: + + <20020201195627.33539.96671@nightshade.la.mastaler.com> + + Optional idstring if given is a string used to strengthen the + uniqueness of the Message-ID, otherwise an empty string is used. + """ + timeval = time.time() + utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval)) + pid = os.getpid() + randint = random.randrange(100000) + if idstring is None: + idstring = '' + else: + idstring = '.' + idstring + idhost = socket.getfqdn() + msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost) + return msgid + + + +# These functions are in the standalone mimelib version only because they've +# subsequently been fixed in the latest Python versions. We use this to worm +# around broken older Pythons. +def parsedate(data): + if not data: + return None + return _parsedate(data) + + +def parsedate_tz(data): + if not data: + return None + return _parsedate_tz(data) + + +def parseaddr(addr): + realname, emailaddr = _parseaddr(addr) + if realname == '' and emailaddr is None: + return '', '' + return realname, emailaddr diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py index c13495b..f4a5b76 100644 --- a/Lib/email/__init__.py +++ b/Lib/email/__init__.py @@ -1,14 +1,16 @@ -# Copyright (C) 2001 Python Software Foundation +# Copyright (C) 2001,2002 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """A package for parsing, handling, and generating email messages. """ -__version__ = '1.0' +__version__ = '2.0' -__all__ = ['Encoders', +__all__ = ['Charset', + 'Encoders', 'Errors', 'Generator', + 'Header', 'Iterators', 'MIMEAudio', 'MIMEBase', @@ -18,6 +20,8 @@ __all__ = ['Encoders', 'Message', 'Parser', 'Utils', + 'base64MIME', + 'quopriMIME', 'message_from_string', 'message_from_file', ] diff --git a/Lib/email/base64MIME.py b/Lib/email/base64MIME.py new file mode 100644 index 0000000..08420b2 --- /dev/null +++ b/Lib/email/base64MIME.py @@ -0,0 +1,174 @@ +# Copyright (C) 2002 Python Software Foundation +# Author: che@debian.org (Ben Gertzfield) + +"""Base64 content transfer encoding per RFCs 2045-2047. + +This module handles the content transfer encoding method defined in RFC 2045 +to encode arbitrary 8-bit data using the three 8-bit bytes in four 7-bit +characters encoding known as Base64. + +It is used in the MIME standards for email to attach images, audio, and text +using some 8-bit character sets to messages. + +This module provides an interface to encode and decode both headers and bodies +with Base64 encoding. + +RFC 2045 defines a method for including character set information in an +`encoded-word' in a header. This method is commonly used for 8-bit real names +in To:, From:, Cc:, etc. fields, as well as Subject: lines. + +This module does not do the line wrapping or end-of-line character conversion +necessary for proper internationalized headers; it only does dumb encoding and +decoding. To deal with the various line wrapping issues, use the email.Header +module. +""" + +import re +from binascii import b2a_base64, a2b_base64 +from email.Utils import fix_eols + +CRLF = '\r\n' +NL = '\n' +EMPTYSTRING = '' + +# See also Charset.py +MISC_LEN = 7 + + + +# Helpers +def base64_len(s): + """Return the length of s when it is encoded with base64.""" + groups_of_3, leftover = divmod(len(s), 3) + # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in. + # Thanks, Tim! + n = groups_of_3 * 4 + if leftover: + n += 4 + return n + + + +def header_encode(header, charset='iso-8859-1', keep_eols=0, maxlinelen=76, + eol=NL): + """Encode a single header line with Base64 encoding in a given charset. + + Defined in RFC 2045, this Base64 encoding is identical to normal Base64 + encoding, except that each line must be intelligently wrapped (respecting + the Base64 encoding), and subsequent lines must start with a space. + + charset names the character set to use to encode the header. It defaults + to iso-8859-1. + + End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted + to the canonical email line separator \\r\\n unless the keep_eols + parameter is set to true (the default is false). + + Each line of the header will be terminated in the value of eol, which + defaults to "\\n". Set this to "\\r\\n" if you are using the result of + this function directly in email. + + The resulting string will be in the form: + + "=?charset?b?WW/5ciBtYXp66XLrIHf8eiBhIGhhbXBzdGHuciBBIFlv+XIgbWF6euly?=\\n + =?charset?b?6yB3/HogYSBoYW1wc3Rh7nIgQkMgWW/5ciBtYXp66XLrIHf8eiBhIGhh?=" + + with each line wrapped at, at most, maxlinelen characters (defaults to 76 + characters). + """ + # Return empty headers unchanged + if not header: + return header + + if not keep_eols: + header = fix_eols(header) + + # Base64 encode each line, in encoded chunks no greater than maxlinelen in + # length, after the RFC chrome is added in. + base64ed = [] + max_encoded = maxlinelen - len(charset) - MISC_LEN + max_unencoded = max_encoded * 3 / 4 + + # BAW: Ben's original code used a step of max_unencoded, but I think it + # ought to be max_encoded. Otherwise, where's max_encoded used? I'm + # still not sure what the + for i in range(0, len(header), max_unencoded): + base64ed.append(b2a_base64(header[i:i+max_unencoded])) + + # Now add the RFC chrome to each encoded chunk + lines = [] + for line in base64ed: + # Ignore the last character of each line if it is a newline + if line[-1] == NL: + line = line[:-1] + # Add the chrome + lines.append('=?%s?b?%s?=' % (charset, line)) + # Glue the lines together and return it. BAW: should we be able to + # specify the leading whitespace in the joiner? + joiner = eol + ' ' + return joiner.join(lines) + + + +def encode(s, binary=1, maxlinelen=76, eol=NL): + """Encode a string with base64. + + Each line will be wrapped at, at most, maxlinelen characters (defaults to + 76 characters). + + If binary is false, end-of-line characters will be converted to the + canonical email end-of-line sequence \\r\\n. Otherwise they will be left + verbatim (this is the default). + + Each line of encoded text will end with eol, which defaults to "\\n". Set + this to "\r\n" if you will be using the result of this function directly + in an email. + """ + if not s: + return s + + if not binary: + s = fix_eols(s) + + encvec = [] + max_unencoded = maxlinelen * 3 / 4 + for i in range(0, len(s), max_unencoded): + # BAW: should encode() inherit b2a_base64()'s dubious behavior in + # adding a newline to the encoded string? + enc = b2a_base64(s[i:i + max_unencoded]) + if enc[-1] == NL and eol <> NL: + enc = enc[:-1] + eol + encvec.append(enc) + return EMPTYSTRING.join(encvec) + + +# For convenience and backwards compatibility w/ standard base64 module +body_encode = encode +encodestring = encode + + + +def decode(s, convert_eols=None): + """Decode a raw base64 string. + + If convert_eols is set to a string value, all canonical email linefeeds, + e.g. "\\r\\n", in the decoded text will be converted to the value of + convert_eols. os.linesep is a good choice for convert_eols if you are + decoding a text attachment. + + This function does not parse a full MIME header value encoded with + base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high + level email.Header class for that functionality. + """ + if not s: + return s + + dec = a2b_base64(s) + if convert_eols: + return dec.replace(CRLF, convert_eols) + return dec + + +# For convenience and backwards compatibility w/ standard base64 module +body_decode = decode +decodestring = decode diff --git a/Lib/email/quopriMIME.py b/Lib/email/quopriMIME.py new file mode 100644 index 0000000..002034e --- /dev/null +++ b/Lib/email/quopriMIME.py @@ -0,0 +1,312 @@ +# Copyright (C) 2001,2002 Python Software Foundation +# Author: che@debian.org (Ben Gertzfield) + +"""Quoted-printable content transfer encoding per RFCs 2045-2047. + +This module handles the content transfer encoding method defined in RFC 2045 +to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to +safely encode text that is in a character set similar to the 7-bit US ASCII +character set, but that includes some 8-bit characters that are normally not +allowed in email bodies or headers. + +Quoted-printable is very space-inefficient for encoding binary files; use the +email.base64MIME module for that instead. + +This module provides an interface to encode and decode both headers and bodies +with quoted-printable encoding. + +RFC 2045 defines a method for including character set information in an +`encoded-word' in a header. This method is commonly used for 8-bit real names +in To:/From:/Cc: etc. fields, as well as Subject: lines. + +This module does not do the line wrapping or end-of-line character +conversion necessary for proper internationalized headers; it only +does dumb encoding and decoding. To deal with the various line +wrapping issues, use the email.Header module. +""" + +import re +from string import hexdigits +from email.Utils import fix_eols + +CRLF = '\r\n' +NL = '\n' + +# See also Charset.py +MISC_LEN = 7 + +hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]') +bqre = re.compile(r'[^ !-<>-~\t]') + + + +# Helpers +def header_quopri_check(c): + """Return true if the character should be escaped with header quopri.""" + return hqre.match(c) and 1 + + +def body_quopri_check(c): + """Return true if the character should be escaped with body quopri.""" + return bqre.match(c) and 1 + + +def header_quopri_len(s): + """Return the length of str when it is encoded with header quopri.""" + count = 0 + for c in s: + if hqre.match(c): + count += 3 + else: + count += 1 + return count + + +def body_quopri_len(str): + """Return the length of str when it is encoded with body quopri.""" + count = 0 + for c in str: + if bqre.match(c): + count += 3 + else: + count += 1 + return count + + +def _max_append(L, s, maxlen, extra=''): + if not L: + L.append(s) + elif len(L[-1]) + len(s) < maxlen: + L[-1] += extra + s + else: + L.append(s) + + +def unquote(s): + """Turn a string in the form =AB to the ASCII character with value 0xab""" + return chr(int(s[1:3], 16)) + + +def quote(c): + return "=%02X" % ord(c) + + + +def header_encode(header, charset="iso-8859-1", keep_eols=0, maxlinelen=76, + eol=NL): + """Encode a single header line with quoted-printable (like) encoding. + + Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but + used specifically for email header fields to allow charsets with mostly 7 + bit characters (and some 8 bit) to remain more or less readable in non-RFC + 2045 aware mail clients. + + charset names the character set to use to encode the header. It defaults + to iso-8859-1. + + The resulting string will be in the form: + + "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n + =?charset?q?Silly_=C8nglish_Kn=EEghts?=" + + with each line wrapped safely at, at most, maxlinelen characters (defaults + to 76 characters). + + End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted + to the canonical email line separator \\r\\n unless the keep_eols + parameter is set to true (the default is false). + + Each line of the header will be terminated in the value of eol, which + defaults to "\\n". Set this to "\\r\\n" if you are using the result of + this function directly in email. + """ + # Return empty headers unchanged + if not header: + return header + + if not keep_eols: + header = fix_eols(header) + + # Quopri encode each line, in encoded chunks no greater than maxlinelen in + # lenght, after the RFC chrome is added in. + quoted = [] + max_encoded = maxlinelen - len(charset) - MISC_LEN + + for c in header: + # Space may be represented as _ instead of =20 for readability + if c == ' ': + _max_append(quoted, '_', max_encoded) + # These characters can be included verbatim + elif not hqre.match(c): + _max_append(quoted, c, max_encoded) + # Otherwise, replace with hex value like =E2 + else: + _max_append(quoted, "=%02X" % ord(c), max_encoded) + + # Now add the RFC chrome to each encoded chunk and glue the chunks + # together. BAW: should we be able to specify the leading whitespace in + # the joiner? + joiner = eol + ' ' + return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted]) + + + +def encode(body, binary=0, maxlinelen=76, eol=NL): + """Encode with quoted-printable, wrapping at maxlinelen characters. + + If binary is false (the default), end-of-line characters will be converted + to the canonical email end-of-line sequence \\r\\n. Otherwise they will + be left verbatim. + + Each line of encoded text will end with eol, which defaults to "\\n". Set + this to "\\r\\n" if you will be using the result of this function directly + in an email. + + Each line will be wrapped at, at most, maxlinelen characters (defaults to + 76 characters). Long lines will have the `soft linefeed' quoted-printable + character "=" appended to them, so the decoded text will be identical to + the original text. + """ + if not body: + return body + + if not binary: + body = fix_eols(body) + + # BAW: We're accumulating the body text by string concatenation. That + # can't be very efficient, but I don't have time now to rewrite it. It + # just feels like this algorithm could be more efficient. + encoded_body = '' + lineno = -1 + # Preserve line endings here so we can check later to see an eol needs to + # be added to the output later. + lines = body.splitlines(1) + for line in lines: + # But strip off line-endings for processing this line. + if line.endswith(CRLF): + line = line[:-2] + elif line[-1] in CRLF: + line = line[:-1] + + lineno += 1 + encoded_line = '' + prev = None + linelen = len(line) + # Now we need to examine every character to see if it needs to be + # quopri encoded. BAW: again, string concatenation is inefficient. + for j in range(linelen): + c = line[j] + prev = c + if bqre.match(c): + c = quote(c) + elif j+1 == linelen: + # Check for whitespace at end of line; special case + if c not in ' \t': + encoded_line += c + prev = c + continue + # Check to see to see if the line has reached its maximum length + if len(encoded_line) + len(c) >= maxlinelen: + encoded_body += encoded_line + '=' + eol + encoded_line = '' + encoded_line += c + # Now at end of line.. + if prev and prev in ' \t': + # Special case for whitespace at end of file + if lineno+1 == len(lines): + prev = quote(prev) + if len(encoded_line) + len(prev) > maxlinelen: + encoded_body += encoded_line + '=' + eol + prev + else: + encoded_body += encoded_line + prev + # Just normal whitespace at end of line + else: + encoded_body += encoded_line + prev + '=' + eol + encoded_line = '' + # Now look at the line we just finished and it has a line ending, we + # need to add eol to the end of the line. + if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF: + encoded_body += encoded_line + eol + else: + encoded_body += encoded_line + encoded_line = '' + return encoded_body + + +# For convenience and backwards compatibility w/ standard base64 module +body_encode = encode +encodestring = encode + + + +# BAW: I'm not sure if the intent was for the signature of this function to be +# the same as base64MIME.decode() or not... +def decode(encoded, eol=NL): + """Decode a quoted-printable string. + + Lines are separated with eol, which defaults to \\n. + """ + if not encoded: + return encoded + # BAW: see comment in encode() above. Again, we're building up the + # decoded string with string concatenation, which could be done much more + # efficiently. + decoded = '' + + for line in encoded.splitlines(): + line = line.rstrip() + if not line: + decoded += eol + continue + + i = 0 + n = len(line) + while i < n: + c = line[i] + if c <> '=': + decoded += c + i += 1 + # Otherwise, c == "=". Are we at the end of the line? If so, add + # a soft line break. + elif i+1 == n: + i += 1 + continue + # Decode if in form =AB + elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits: + decoded += unquote(line[i:i+3]) + i += 3 + # Otherwise, not in form =AB, pass literally + else: + decoded += c + i += 1 + + if i == n: + decoded += eol + # Special case if original string did not end with eol + if encoded[-1] <> eol and decoded[-1] == eol: + decoded = decoded[:-1] + return decoded + + +# For convenience and backwards compatibility w/ standard base64 module +body_decode = decode +decodestring = decode + + + +def _unquote_match(match): + """Turn a match in the form =AB to the ASCII character with value 0xab""" + s = match.group(0) + return unquote(s) + + +# Header decoding is done a bit differently +def header_decode(s): + """Decode a string encoded with RFC 2045 MIME header `Q' encoding. + + This function does not parse a full MIME header value encoded with + quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use + the high level email.Header class for that functionality. + """ + s = s.replace('_', ' ') + return re.sub(r'=\w{2}', _unquote_match, s) |