summaryrefslogtreecommitdiffstats
path: root/Lib/email
diff options
context:
space:
mode:
authorBarry Warsaw <barry@python.org>2002-04-10 21:01:31 (GMT)
committerBarry Warsaw <barry@python.org>2002-04-10 21:01:31 (GMT)
commit409a4c08b545aa064cf8fe3b8de51404756a301e (patch)
tree06cf8fe44e1fe28fbc0147635ec41961f2df6515 /Lib/email
parent68e69338ae19c37bd3e69cb76e107bfa76231e06 (diff)
downloadcpython-409a4c08b545aa064cf8fe3b8de51404756a301e.zip
cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.gz
cpython-409a4c08b545aa064cf8fe3b8de51404756a301e.tar.bz2
Sync'ing with standalone email package 2.0.1. This adds support for
non-us-ascii character sets in headers and bodies. Some API changes (with DeprecationWarnings for the old APIs). Better RFC-compliant implementations of base64 and quoted-printable. Updated test cases. Documentation updates to follow (after I finish writing them ;).
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/Charset.py327
-rw-r--r--Lib/email/Encoders.py10
-rw-r--r--Lib/email/Errors.py2
-rw-r--r--Lib/email/Generator.py54
-rw-r--r--Lib/email/Header.py210
-rw-r--r--Lib/email/Iterators.py2
-rw-r--r--Lib/email/MIMEBase.py2
-rw-r--r--Lib/email/MIMEImage.py2
-rw-r--r--Lib/email/MIMEMessage.py2
-rw-r--r--Lib/email/MIMEText.py35
-rw-r--r--Lib/email/Message.py223
-rw-r--r--Lib/email/Parser.py26
-rw-r--r--Lib/email/Utils.py143
-rw-r--r--Lib/email/__init__.py10
-rw-r--r--Lib/email/base64MIME.py174
-rw-r--r--Lib/email/quopriMIME.py312
16 files changed, 1438 insertions, 96 deletions
diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
new file mode 100644
index 0000000..4874597
--- /dev/null
+++ b/Lib/email/Charset.py
@@ -0,0 +1,327 @@
+# Copyright (C) 2001,2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+from types import UnicodeType
+from email.Encoders import encode_7or8bit
+import email.base64MIME
+import email.quopriMIME
+
+
+
+# Flags for types of header encodings
+QP = 1 # Quoted-Printable
+BASE64 = 2 # Base64
+
+# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
+MISC_LEN = 7
+
+DEFAULT_CHARSET = 'us-ascii'
+
+
+
+# Defaults
+CHARSETS = {
+ # input header enc body enc output conv
+ 'iso-8859-1': (QP, QP, None),
+ 'iso-8859-2': (QP, QP, None),
+ 'us-ascii': (None, None, None),
+ 'big5': (BASE64, BASE64, None),
+ 'gb2312': (BASE64, BASE64, None),
+ 'euc-jp': (BASE64, None, 'iso-2022-jp'),
+ 'shift_jis': (BASE64, None, 'iso-2022-jp'),
+ 'iso-2022-jp': (BASE64, None, None),
+ 'koi8-r': (BASE64, BASE64, None),
+ 'utf-8': (BASE64, BASE64, 'utf-8'),
+ }
+
+# Aliases for other commonly-used names for character sets. Map
+# them to the real ones used in email.
+ALIASES = {
+ 'latin_1': 'iso-8859-1',
+ 'latin-1': 'iso-8859-1',
+ 'ascii': 'us-ascii',
+ }
+
+# Map charsets to their Unicode codec strings. Note that the Japanese
+# examples included below do not (yet) come with Python! They are available
+# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
+
+# The Chinese and Korean codecs are available from SourceForge:
+#
+# http://sourceforge.net/projects/python-codecs/
+#
+# although you'll need to check them out of cvs since they haven't been file
+# released yet. You might also try to use
+#
+# http://www.freshports.org/port-description.php3?port=6702
+#
+# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
+# fairly experimental at this point.
+CODEC_MAP = {
+ 'euc-jp': 'japanese.euc-jp',
+ 'iso-2022-jp': 'japanese.iso-2022-jp',
+ 'shift_jis': 'japanese.shift_jis',
+ 'gb2312': 'eucgb2312_cn', # key must match the 'gb2312' entry in CHARSETS
+ 'big5': 'big5_tw',
+ 'utf-8': 'utf-8',
+ # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
+ # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
+ # Let that stuff pass through without conversion to/from Unicode.
+ 'us-ascii': None,
+ }
+
+
+
+# Convenience functions for extending the above mappings
+def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
+ """Add charset properties to the global map.
+
+ charset is the input character set, and must be the canonical name of a
+ character set.
+
+ Optional header_enc and body_enc are each either Charset.QP for
+ quoted-printable, Charset.BASE64 for base64 encoding, or None for no
+ encoding. It describes how message headers and message bodies in the
+ input charset are to be encoded. Default is no encoding.
+
+ Optional output_charset is the character set that the output should be
+ in. Conversions will proceed from input charset, to Unicode, to the
+ output charset when the method Charset.convert() is called. The default
+ is to output in the same character set as the input.
+
+ Both input_charset and output_charset must have Unicode codec entries in
+ the module's charset-to-codec mapping; use add_codec(charset, codecname)
+ to add codecs the module does not know about. See the codec module's
+ documentation for more information.
+ """
+ CHARSETS[charset] = (header_enc, body_enc, output_charset)
+
+
+def add_alias(alias, canonical):
+ """Add a character set alias.
+
+ alias is the alias name, e.g. latin-1
+ canonical is the character set's canonical name, e.g. iso-8859-1
+ """
+ ALIASES[alias] = canonical
+
+
+def add_codec(charset, codecname):
+ """Add a codec that maps characters in the given charset to/from Unicode.
+
+ charset is the canonical name of a character set. codecname is the name
+ of a Python codec, as appropriate for the second argument to the unicode()
+ built-in, or to the .encode() method of a Unicode string.
+ """
+ CODEC_MAP[charset] = codecname
+
+
+
+class Charset:
+ """Map character sets to their email properties.
+
+ This class provides information about the requirements imposed on email
+ for a specific character set. It also provides convenience routines for
+ converting between character sets, given the availability of the
+ applicable codecs. Given a character set, it will do its best to provide
+ information on how to use that character set in an email.
+
+ Certain character sets must be encoded with quoted-printable or base64
+ when used in email headers or bodies. Certain character sets must be
+ converted outright, and are not allowed in email. Instances of this
+ class expose the following information about a character set:
+
+ input_charset: The initial character set specified. Common aliases
+ are converted to their `official' email names (e.g. latin_1
+ is converted to iso-8859-1). Defaults to 7-bit us-ascii.
+
+ header_encoding: If the character set must be encoded before it can be
+ used in an email header, this attribute will be set to
+ Charset.QP (for quoted-printable) or Charset.BASE64 (for
+ base64 encoding). Otherwise, it will be None.
+
+ body_encoding: Same as header_encoding, but describes the encoding for the
+ mail message's body, which indeed may be different than the
+ header encoding.
+
+ output_charset: Some character sets must be converted before they can be
+ used in email headers or bodies. If the input_charset is
+ one of them, this attribute will contain the name of the
+ charset output will be converted to. Otherwise, it will
+ be None.
+
+ input_codec: The name of the Python codec used to convert the
+ input_charset to Unicode. If no conversion codec is
+ necessary, this attribute will be None.
+
+ output_codec: The name of the Python codec used to convert Unicode
+ to the output_charset. If no conversion codec is necessary,
+ this attribute will have the same value as the input_codec.
+ """
+ def __init__(self, input_charset=DEFAULT_CHARSET):
+ # Set the input charset after filtering through the aliases
+ self.input_charset = ALIASES.get(input_charset, input_charset)
+ # We can try to guess which encoding and conversion to use by the
+ # CHARSETS dictionary. Charsets missing from it default to base64
+ # for both headers and bodies, with no output conversion.
+ henc, benc, conv = CHARSETS.get(self.input_charset,
+ (BASE64, BASE64, None))
+ # Set the attributes from the values looked up above.
+ self.header_encoding = henc
+ self.body_encoding = benc
+ self.output_charset = ALIASES.get(conv, conv)
+ # Now set the codecs. If one isn't defined for input_charset,
+ # guess and try a Unicode codec with the same name as input_codec.
+ self.input_codec = CODEC_MAP.get(self.input_charset,
+ self.input_charset)
+ self.output_codec = CODEC_MAP.get(self.output_charset,
+ self.input_codec)
+
+ def __str__(self):
+ return self.input_charset.lower()
+
+ def __eq__(self, other):
+ return str(self) == str(other).lower()
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+ def get_body_encoding(self):
+ """Return the content-transfer-encoding used for body encoding.
+
+ This is either the string `quoted-printable' or `base64' depending on
+ the encoding used, or it is a function in which case you should call
+ the function with a single argument, the Message object being
+ encoded. The function should then set the Content-Transfer-Encoding:
+ header itself to whatever is appropriate.
+
+ Returns "quoted-printable" if self.body_encoding is QP.
+ Returns "base64" if self.body_encoding is BASE64.
+ Returns the encode_7or8bit function otherwise.
+ """
+ if self.body_encoding == QP:
+ return 'quoted-printable'
+ elif self.body_encoding == BASE64:
+ return 'base64'
+ else:
+ return encode_7or8bit
+
+ def convert(self, s):
+ """Convert a string from the input_codec to the output_codec."""
+ if self.input_codec <> self.output_codec:
+ return unicode(s, self.input_codec).encode(self.output_codec)
+ else:
+ return s
+
+ def to_splittable(self, s):
+ """Convert a possibly multibyte string to a safely splittable format.
+
+ Uses the input_codec to try and convert the string to Unicode, so it
+ can be safely split on character boundaries (even for double-byte
+ characters).
+
+ Returns the string untouched if we don't know how to convert it to
+ Unicode with the input_charset.
+
+ Characters that could not be converted to Unicode will be replaced
+ with the Unicode replacement character U+FFFD.
+ """
+ if isinstance(s, UnicodeType) or self.input_codec is None:
+ return s
+ try:
+ return unicode(s, self.input_codec, 'replace')
+ except LookupError:
+ # Input codec not installed on system, so return the original
+ # string unchanged.
+ return s
+
+ def from_splittable(self, ustr, to_output=1):
+ """Convert a splittable string back into an encoded string.
+
+ Uses the proper codec to try and convert the string from
+ Unicode back into an encoded format. Return the string as-is
+ if it is not Unicode, or if it could not be encoded from
+ Unicode.
+
+ Characters that could not be converted from Unicode will be replaced
+ with an appropriate character (usually '?').
+
+ If to_output is true, uses output_codec to convert to an encoded
+ format. If to_output is false, uses input_codec. to_output defaults
+ to 1.
+ """
+ if to_output:
+ codec = self.output_codec
+ else:
+ codec = self.input_codec
+ if not isinstance(ustr, UnicodeType) or codec is None:
+ return ustr
+ try:
+ return ustr.encode(codec, 'replace')
+ except LookupError:
+ # Output codec not installed
+ return ustr
+
+ def get_output_charset(self):
+ """Return the output character set.
+
+ This is self.output_charset if that is set, otherwise it is
+ self.input_charset.
+ """
+ return self.output_charset or self.input_charset
+
+ def encoded_header_len(self, s):
+ """Return the length of the encoded header string."""
+ cset = self.get_output_charset()
+ # The len(s) of a 7bit encoding is len(s)
+ if self.header_encoding is BASE64:
+ return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
+ elif self.header_encoding is QP:
+ return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
+ else:
+ return len(s)
+
+ def header_encode(self, s, convert=0):
+ """Header-encode a string, optionally converting it to output_charset.
+
+ If convert is true, the string will be converted from the input
+ charset to the output charset automatically. This is not useful for
+ multibyte character sets, which have line length issues (multibyte
+ characters must be split on a character, not a byte boundary); use the
+ high-level Header class to deal with these issues. convert defaults
+ to 0.
+
+ The type of encoding (base64 or quoted-printable) will be based on
+ self.header_encoding.
+ """
+ cset = self.get_output_charset()
+ if convert:
+ s = self.convert(s)
+ # 7bit/8bit encodings return the string unchanged (modulo conversions)
+ if self.header_encoding is BASE64:
+ return email.base64MIME.header_encode(s, cset)
+ elif self.header_encoding is QP:
+ return email.quopriMIME.header_encode(s, cset)
+ else:
+ return s
+
+ def body_encode(self, s, convert=1):
+ """Body-encode a string and convert it to output_charset.
+
+ If convert is true (the default), the string will be converted from
+ the input charset to output charset automatically. Unlike
+ header_encode(), there are no issues with byte boundaries and
+ multibyte charsets in email bodies, so this is usually pretty safe.
+
+ The type of encoding (base64 or quoted-printable) will be based on
+ self.body_encoding.
+ """
+ if convert:
+ s = self.convert(s)
+ # 7bit/8bit encodings return the string unchanged (modulo conversions)
+ if self.body_encoding is BASE64:
+ return email.base64MIME.body_encode(s)
+ elif self.body_encoding is QP:
+ return email.quopriMIME.body_encode(s)
+ else:
+ return s
diff --git a/Lib/email/Encoders.py b/Lib/email/Encoders.py
index d9cd42d..f09affa 100644
--- a/Lib/email/Encoders.py
+++ b/Lib/email/Encoders.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Module containing encoding functions for Image.Image and Text.Text.
@@ -11,7 +11,9 @@ from quopri import encodestring as _encodestring
# Helpers
def _qencode(s):
- return _encodestring(s, quotetabs=1)
+ enc = _encodestring(s, quotetabs=1)
+ # Must encode spaces, which quopri.encodestring() doesn't do
+ return enc.replace(' ', '=20')
def _bencode(s):
@@ -54,6 +56,10 @@ def encode_quopri(msg):
def encode_7or8bit(msg):
"""Set the Content-Transfer-Encoding: header to 7bit or 8bit."""
orig = msg.get_payload()
+ if orig is None:
+ # There's no payload. For backwards compatibility we use 7bit
+ msg['Content-Transfer-Encoding'] = '7bit'
+ return
# We play a trick to make this go fast. If encoding to ASCII succeeds, we
# know the data must be 7bit, otherwise treat it as 8bit.
try:
diff --git a/Lib/email/Errors.py b/Lib/email/Errors.py
index 71d7663..e3a3666 100644
--- a/Lib/email/Errors.py
+++ b/Lib/email/Errors.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""email package exception classes.
diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py
index 981e0ff..dbbcabc 100644
--- a/Lib/email/Generator.py
+++ b/Lib/email/Generator.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Classes to generate plain text from a message object tree.
@@ -166,30 +166,33 @@ class Generator:
return text
rtn = []
for line in text.split('\n'):
+ splitline = []
# Short lines can remain unchanged
if len(line.replace('\t', SPACE8)) <= maxheaderlen:
- rtn.append(line)
- SEMINLTAB.join(rtn)
+ splitline.append(line)
+ rtn.append(SEMINLTAB.join(splitline))
else:
- oldlen = len(text)
+ oldlen = len(line)
# Try to break the line on semicolons, but if that doesn't
# work, try to split on folding whitespace.
- while len(text) > maxheaderlen:
- i = text.rfind(';', 0, maxheaderlen)
+ while len(line) > maxheaderlen:
+ i = line.rfind(';', 0, maxheaderlen)
if i < 0:
break
- rtn.append(text[:i])
- text = text[i+1:].lstrip()
- if len(text) <> oldlen:
+ splitline.append(line[:i])
+ line = line[i+1:].lstrip()
+ if len(line) <> oldlen:
# Splitting on semis worked
- rtn.append(text)
- return SEMINLTAB.join(rtn)
+ splitline.append(line)
+ rtn.append(SEMINLTAB.join(splitline))
+ continue
# Splitting on semis didn't help, so try to split on
# whitespace.
- parts = re.split(r'(\s+)', text)
+ parts = re.split(r'(\s+)', line)
# Watch out though for "Header: longnonsplittableline"
if parts[0].endswith(':') and len(parts) == 3:
- return text
+ rtn.append(line)
+ continue
first = parts.pop(0)
sublines = [first]
acc = len(first)
@@ -203,13 +206,14 @@ class Generator:
else:
# Split it here, but don't forget to ignore the
# next whitespace-only part
- rtn.append(EMPTYSTRING.join(sublines))
+ splitline.append(EMPTYSTRING.join(sublines))
del parts[0]
first = parts.pop(0)
sublines = [first]
acc = len(first)
- rtn.append(EMPTYSTRING.join(sublines))
- return NLTAB.join(rtn)
+ splitline.append(EMPTYSTRING.join(sublines))
+ rtn.append(NLTAB.join(splitline))
+ return NL.join(rtn)
#
# Handlers for writing types and subtypes
@@ -219,6 +223,9 @@ class Generator:
payload = msg.get_payload()
if payload is None:
return
+ cset = msg.get_charset()
+ if cset is not None:
+ payload = cset.body_encode(payload)
if not isinstance(payload, StringType):
raise TypeError, 'string payload expected: %s' % type(payload)
if self._mangle_from_:
@@ -233,7 +240,18 @@ class Generator:
# together, and then make sure that the boundary we've chosen isn't
# present in the payload.
msgtexts = []
- for part in msg.get_payload():
+ subparts = msg.get_payload()
+ if subparts is None:
+ # Nothing has ever been attached
+ boundary = msg.get_boundary(failobj=_make_boundary())
+ print >> self._fp, '--' + boundary
+ print >> self._fp, '\n'
+ print >> self._fp, '--' + boundary + '--'
+ return
+ elif not isinstance(subparts, ListType):
+ # Scalar payload
+ subparts = [subparts]
+ for part in subparts:
s = StringIO()
g = self.__class__(s, self._mangle_from_, self.__maxheaderlen)
g(part, unixfrom=0)
@@ -365,7 +383,7 @@ class DecodedGenerator(Generator):
# Helper
-def _make_boundary(self, text=None):
+def _make_boundary(text=None):
# Craft a random boundary. If text is given, ensure that the chosen
# boundary doesn't appear in the text.
boundary = ('=' * 15) + repr(random.random()).split('.')[1] + '=='
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
new file mode 100644
index 0000000..097b978
--- /dev/null
+++ b/Lib/email/Header.py
@@ -0,0 +1,210 @@
+# Copyright (C) 2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+"""Header encoding and decoding functionality."""
+
+import re
+import email.quopriMIME
+import email.base64MIME
+from email.Charset import Charset
+
+CRLFSPACE = '\r\n '
+CRLF = '\r\n'
+NLSPACE = '\n '
+
+MAXLINELEN = 76
+
+ENCODE = 1
+DECODE = 2
+
+# Match encoded-word strings in the form =?charset?q?Hello_World?=
+ecre = re.compile(r'''
+ =\? # literal =?
+ (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
+ \? # literal ?
+ (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
+ \? # literal ?
+ (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
+ \?= # literal ?=
+ ''', re.VERBOSE | re.IGNORECASE)
+
+
+
+# Helpers
+_max_append = email.quopriMIME._max_append
+
+
+
+def decode_header(header):
+ """Decode a message header value without converting charset.
+
+ Returns a list of (decoded_string, charset) pairs containing each of the
+ decoded parts of the header. Charset is None for non-encoded parts of the
+ header, otherwise a lower-case string containing the name of the character
+ set specified in the encoded string.
+ """
+ # If no encoding, just return the header
+ header = str(header)
+ if not ecre.search(header):
+ return [(header, None)]
+
+ decoded = []
+ dec = ''
+ for line in header.splitlines():
+ # This line might not have an encoding in it
+ if not ecre.search(line):
+ decoded.append((line, None))
+ continue
+
+ parts = ecre.split(line)
+ while parts:
+ unenc = parts.pop(0).strip()
+ if unenc:
+ # Continue a long unencoded line: keep this text, space-joined
+ if decoded and decoded[-1][1] is None:
+ decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
+ else:
+ decoded.append((unenc, None))
+ if parts:
+ charset, encoding = [s.lower() for s in parts[0:2]]
+ encoded = parts[2]
+ dec = ''
+ if encoding == 'q':
+ dec = email.quopriMIME.header_decode(encoded)
+ elif encoding == 'b':
+ dec = email.base64MIME.decode(encoded)
+ else:
+ dec = encoded
+
+ if decoded and decoded[-1][1] == charset:
+ decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
+ else:
+ decoded.append((dec, charset))
+ del parts[0:3]
+ return decoded
+
+
+
+class Header:
+ def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
+ header_name=None):
+ """Create a MIME-compliant header that can contain many languages.
+
+ Specify the initial header value in s. Specify its character set as a
+ Charset object in the charset argument. If none, a default Charset
+ instance will be used.
+
+ You can later append to the header with append(s, charset) below;
+ charset does not have to be the same as the one initially specified
+ here. In fact, it's optional, and if not given, defaults to the
+ charset specified in the constructor.
+
+ The maximum line length can either be specified by maxlinelen, or you
+ can pass in the name of the header field (e.g. "Subject") to let this
+ class guess the best line length to use to prevent wrapping. The
+ default maxlinelen is 76.
+ """
+ if charset is None:
+ charset = Charset()
+ self._charset = charset
+ # BAW: I believe `chunks' and `maxlinelen' should be non-public.
+ self._chunks = []
+ self.append(s, charset)
+ self._maxlinelen = maxlinelen
+ if header_name is not None:
+ self.guess_maxlinelen(header_name)
+
+ def __str__(self):
+ """A synonym for self.encode()."""
+ return self.encode()
+
+ def guess_maxlinelen(self, s=None):
+ """Guess the maximum length to make each header line.
+
+ Given a header name (e.g. "Subject"), set this header's maximum line
+ length to an appropriate length to avoid line wrapping. If s is not
+ given, return the previous maximum line length and don't set it.
+
+ Returns the new maximum line length.
+ """
+ # BAW: is this semantic necessary?
+ if s is not None:
+ self._maxlinelen = MAXLINELEN - len(s) - 2
+ return self._maxlinelen
+
+ def append(self, s, charset=None):
+ """Append string s with Charset charset to the MIME header.
+
+ charset defaults to the one given in the class constructor.
+ """
+ if charset is None:
+ charset = self._charset
+ self._chunks.append((s, charset))
+
+ def _split(self, s, charset):
+ # Split up a header safely for use with encode_chunks. BAW: this
+ # appears to be a private convenience method.
+ splittable = charset.to_splittable(s)
+ encoded = charset.from_splittable(splittable)
+
+ if charset.encoded_header_len(encoded) < self._maxlinelen:
+ return [(encoded, charset)]
+ else:
+ # Divide and conquer. BAW: halfway depends on integer division.
+ # When porting to Python 2.2, use the // operator.
+ halfway = len(splittable) // 2
+ first = charset.from_splittable(splittable[:halfway], 0)
+ last = charset.from_splittable(splittable[halfway:], 0)
+ return self._split(first, charset) + self._split(last, charset)
+
+ def encode(self):
+ """Encode a message header, possibly converting charset and encoding.
+
+ There are many issues involved in converting a given string for use in
+ an email header. Only certain character sets are readable in most
+ email clients, and as header strings can only contain a subset of
+ 7-bit ASCII, care must be taken to properly convert and encode (with
+ Base64 or quoted-printable) header strings. In addition, there is a
+ 75-character length limit on any given encoded header field, so
+ line-wrapping must be performed, even with double-byte character sets.
+
+ This method will do its best to convert the string to the correct
+ character set used in email, and encode and line wrap it safely with
+ the appropriate scheme for that character set.
+
+ If the given charset is not known or an error occurs during
+ conversion, this function will return the header untouched.
+ """
+ newchunks = []
+ for s, charset in self._chunks:
+ newchunks += self._split(s, charset)
+ self._chunks = newchunks
+ return self.encode_chunks()
+
+ def encode_chunks(self):
+ """MIME-encode a header with many different charsets and/or encodings.
+
+ Given a list of pairs (string, charset), return a MIME-encoded string
+ suitable for use in a header field. Each pair may have different
+ charsets and/or encodings, and the resulting header will accurately
+ reflect each setting.
+
+ Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
+ character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
+ non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
+ (no encoding).
+
+ Each pair will be represented on a separate line; the resulting string
+ will be in the format:
+
+ "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+ =?charset2?b?SvxyZ2VuIEL2aW5n?="
+ """
+ chunks = []
+ for header, charset in self._chunks:
+ if charset is None:
+ _max_append(chunks, header, self._maxlinelen, ' ')
+ else:
+ _max_append(chunks, charset.header_encode(header, 0),
+ self._maxlinelen, ' ')
+ return NLSPACE.join(chunks)
diff --git a/Lib/email/Iterators.py b/Lib/email/Iterators.py
index a64495d..515bac9 100644
--- a/Lib/email/Iterators.py
+++ b/Lib/email/Iterators.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Various types of useful iterators and generators.
diff --git a/Lib/email/MIMEBase.py b/Lib/email/MIMEBase.py
index 33216f6..28816e8 100644
--- a/Lib/email/MIMEBase.py
+++ b/Lib/email/MIMEBase.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Base class for MIME specializations.
diff --git a/Lib/email/MIMEImage.py b/Lib/email/MIMEImage.py
index 963da23..f0e7931a 100644
--- a/Lib/email/MIMEImage.py
+++ b/Lib/email/MIMEImage.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing image/* type MIME documents.
diff --git a/Lib/email/MIMEMessage.py b/Lib/email/MIMEMessage.py
index fc4b2c6..89da925 100644
--- a/Lib/email/MIMEMessage.py
+++ b/Lib/email/MIMEMessage.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing message/* MIME documents.
diff --git a/Lib/email/MIMEText.py b/Lib/email/MIMEText.py
index ccce9fb..8669d28 100644
--- a/Lib/email/MIMEText.py
+++ b/Lib/email/MIMEText.py
@@ -1,9 +1,10 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing text/* type MIME documents.
"""
+import warnings
import MIMEBase
from Encoders import encode_7or8bit
@@ -13,7 +14,7 @@ class MIMEText(MIMEBase.MIMEBase):
"""Class for generating text/* type MIME documents."""
def __init__(self, _text, _subtype='plain', _charset='us-ascii',
- _encoder=encode_7or8bit):
+ _encoder=None):
"""Create a text/* type MIME document.
_text is the string for this message object. If the text does not end
@@ -22,20 +23,26 @@ class MIMEText(MIMEBase.MIMEBase):
_subtype is the MIME sub content type, defaulting to "plain".
_charset is the character set parameter added to the Content-Type:
- header. This defaults to "us-ascii".
-
- _encoder is a function which will perform the actual encoding for
- transport of the text data. It takes one argument, which is this
- Text instance. It should use get_payload() and set_payload() to
- change the payload to the encoded form. It should also add any
- Content-Transfer-Encoding: or other headers to the message as
- necessary. The default encoding doesn't actually modify the payload,
- but it does set Content-Transfer-Encoding: to either `7bit' or `8bit'
- as appropriate.
+ header. This defaults to "us-ascii". Note that as a side-effect, the
+ Content-Transfer-Encoding: header will also be set.
+
+ The use of the _encoder is deprecated. The encoding of the payload,
+ and the setting of the character set parameter now happens implicitly
+ based on the _charset argument. If _encoder is supplied, then a
+ DeprecationWarning is issued, and the _encoder functionality may
+ override any header settings indicated by _charset. This is probably
+ not what you want.
"""
MIMEBase.MIMEBase.__init__(self, 'text', _subtype,
**{'charset': _charset})
if _text and _text[-1] <> '\n':
_text += '\n'
- self.set_payload(_text)
- _encoder(self)
+ self.set_payload(_text, _charset)
+ if _encoder is not None:
+ warnings.warn('_encoder argument is obsolete.',
+ DeprecationWarning, 2)
+ # Because set_payload() with a _charset will set its own
+ # Content-Transfer-Encoding: header, we need to delete the
+ # existing one or will end up with two of them. :(
+ del self['content-transfer-encoding']
+ _encoder(self)
diff --git a/Lib/email/Message.py b/Lib/email/Message.py
index 91931a1..71d10c4 100644
--- a/Lib/email/Message.py
+++ b/Lib/email/Message.py
@@ -1,23 +1,47 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Basic message object for the email package object model.
"""
-from __future__ import generators
-
import re
-import base64
-import quopri
+import warnings
from cStringIO import StringIO
-from types import ListType
+from types import ListType, StringType
# Intrapackage imports
import Errors
import Utils
+import Charset
SEMISPACE = '; '
+
+# Regular expression used to split header parameters. BAW: this may be too
+# simple. It isn't strictly RFC 2045 (section 5.1) compliant, but it catches
+# most headers found in the wild. We may eventually need a full fledged
+# parser.
paramre = re.compile(r'\s*;\s*')
+# Regular expression that matches `special' characters in parameters, the
+# existence of which forces quoting of the parameter value.
+tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
+
+
+
+# Helper function
+def _formatparam(param, value=None, quote=1):
+ """Convenience function to format and return a key=value pair.
+
+ Will quote the value if needed or if quote is true.
+ """
+ if value is not None and len(value) > 0:
+ # BAW: Please check this. I think that if quote is set it should
+ # force quoting even if not necessary.
+ if quote or tspecials.search(value):
+ return '%s="%s"' % (param, Utils.quote(value))
+ else:
+ return '%s=%s' % (param, value)
+ else:
+ return param
@@ -39,6 +63,7 @@ class Message:
self._headers = []
self._unixfrom = None
self._payload = None
+ self._charset = None
# Defaults for multipart messages
self.preamble = self.epilogue = None
@@ -83,6 +108,8 @@ class Message:
If the current payload is empty, then the current payload will be made
a scalar, set to the given value.
"""
+ warnings.warn('add_payload() is deprecated, use attach() instead.',
+ DeprecationWarning, 2)
if self._payload is None:
self._payload = payload
elif type(self._payload) is ListType:
@@ -93,8 +120,18 @@ class Message:
else:
self._payload = [self._payload, payload]
- # A useful synonym
- attach = add_payload
+ def attach(self, payload):
+ """Add the given payload to the current payload.
+
+ The current payload will always be a list of objects after this method
+ is called. If you want to set the payload to a scalar object
+ (e.g. because you're attaching a message/rfc822 subpart), use
+ set_payload() instead.
+ """
+ if self._payload is None:
+ self._payload = [payload]
+ else:
+ self._payload.append(payload)
def get_payload(self, i=None, decode=0):
"""Return the current payload exactly as is.
@@ -128,10 +165,58 @@ class Message:
return payload
- def set_payload(self, payload):
- """Set the payload to the given value."""
+ def set_payload(self, payload, charset=None):
+ """Set the payload to the given value.
+
+ Optionally set the charset, which must be a Charset instance."""
self._payload = payload
+ if charset is not None:
+ self.set_charset(charset)
+
+ def set_charset(self, charset):
+ """Set the charset of the payload to a given character set.
+
+ charset can be a string or a Charset object. If it is a string, it
+ will be converted to a Charset object by calling Charset's
+ constructor. If charset is None, the charset parameter will be
+ removed from the Content-Type: field. Anything else will generate a
+ TypeError.
+
+ The message will be assumed to be a text message encoded with
+ charset.input_charset. It will be converted to charset.output_charset
+ and encoded properly, if needed, when generating the plain text
+ representation of the message. MIME headers (MIME-Version,
+ Content-Type, Content-Transfer-Encoding) will be added as needed.
+ """
+ if charset is None:
+ self.del_param('charset')
+ self._charset = None
+ return
+ if isinstance(charset, StringType):
+ charset = Charset.Charset(charset)
+ if not isinstance(charset, Charset.Charset):
+ raise TypeError, charset
+ # BAW: should we accept strings that can serve as arguments to the
+ # Charset constructor?
+ self._charset = charset
+ if not self.has_key('MIME-Version'):
+ self.add_header('MIME-Version', '1.0')
+ if not self.has_key('Content-Type'):
+ self.add_header('Content-Type', 'text/plain',
+ charset=charset.get_output_charset())
+ else:
+ self.set_param('charset', charset.get_output_charset())
+ if not self.has_key('Content-Transfer-Encoding'):
+ cte = charset.get_body_encoding()
+ if callable(cte):
+ cte(self)
+ else:
+ self.add_header('Content-Transfer-Encoding', cte)
+ def get_charset(self):
+ """Return the Charset object associated with the message's payload."""
+ return self._charset
+
#
# MAPPING INTERFACE (partial)
#
@@ -257,7 +342,7 @@ class Message:
if v is None:
parts.append(k.replace('_', '-'))
else:
- parts.append('%s="%s"' % (k.replace('_', '-'), v))
+ parts.append(_formatparam(k.replace('_', '-'), v))
if _value is not None:
parts.insert(0, _value)
self._headers.append((_name, SEMISPACE.join(parts)))
@@ -308,6 +393,8 @@ class Message:
for p in paramre.split(value):
try:
name, val = p.split('=', 1)
+ name = name.rstrip()
+ val = val.lstrip()
except ValueError:
# Must have been a bare attribute
name = p
@@ -315,26 +402,29 @@ class Message:
params.append((name, val))
return params
- def get_params(self, failobj=None, header='content-type'):
+ def get_params(self, failobj=None, header='content-type', unquote=1):
"""Return the message's Content-Type: parameters, as a list.
The elements of the returned list are 2-tuples of key/value pairs, as
split on the `=' sign. The left hand side of the `=' is the key,
while the right hand side is the value. If there is no `=' sign in
the parameter the value is the empty string. The value is always
- unquoted.
+ unquoted, unless unquote is set to a false value.
Optional failobj is the object to return if there is no Content-Type:
header. Optional header is the header to search instead of
- Content-Type:
+ Content-Type:.
"""
missing = []
params = self._get_params_preserve(missing, header)
if params is missing:
return failobj
- return [(k, Utils.unquote(v)) for k, v in params]
+ if unquote:
+ return [(k, Utils.unquote(v)) for k, v in params]
+ else:
+ return params
- def get_param(self, param, failobj=None, header='content-type'):
+ def get_param(self, param, failobj=None, header='content-type', unquote=1):
"""Return the parameter value if found in the Content-Type: header.
Optional failobj is the object to return if there is no Content-Type:
@@ -342,15 +432,112 @@ class Message:
Content-Type:
Parameter keys are always compared case insensitively. Values are
- always unquoted.
+ always unquoted, unless unquote is set to a false value.
"""
if not self.has_key(header):
return failobj
for k, v in self._get_params_preserve(failobj, header):
if k.lower() == param.lower():
- return Utils.unquote(v)
+ if unquote:
+ return Utils.unquote(v)
+ else:
+ return v
return failobj
+ def set_param(self, param, value, header='Content-Type', requote=1):
+ """Set a parameter in the Content-Type: header.
+
+ If the parameter already exists in the header, its value will be
+ replaced with the new value.
+
+ If header is Content-Type: and has not yet been defined in this
+ message, it will be set to "text/plain" and the new parameter and
+ value will be appended, as per RFC 2045.
+
+ An alternate header can be specified in the header argument, and
+ all parameters will be quoted as appropriate unless requote is
+ set to a false value.
+ """
+ if not self.has_key(header) and header.lower() == 'content-type':
+ ctype = 'text/plain'
+ else:
+ ctype = self.get(header)
+ if not self.get_param(param, header=header):
+ if not ctype:
+ ctype = _formatparam(param, value, requote)
+ else:
+ ctype = SEMISPACE.join(
+ [ctype, _formatparam(param, value, requote)])
+ else:
+ ctype = ''
+ for old_param, old_value in self.get_params(header=header,
+ unquote=requote):
+ append_param = ''
+ if old_param.lower() == param.lower():
+ append_param = _formatparam(param, value, requote)
+ else:
+ append_param = _formatparam(old_param, old_value, requote)
+ if not ctype:
+ ctype = append_param
+ else:
+ ctype = SEMISPACE.join([ctype, append_param])
+ if ctype <> self.get(header):
+ del self[header]
+ self[header] = ctype
+
+ def del_param(self, param, header='content-type', requote=1):
+ """Remove the given parameter completely from the Content-Type header.
+
+ The header will be re-written in place without param or its value.
+ All values will be quoted as appropriate unless requote is set to a
+ false value.
+ """
+ if not self.has_key(header):
+ return
+ new_ctype = ''
+ for p, v in self.get_params(header, unquote=requote):
+ if p.lower() <> param.lower():
+ if not new_ctype:
+ new_ctype = _formatparam(p, v, requote)
+ else:
+ new_ctype = SEMISPACE.join([new_ctype,
+ _formatparam(p, v, requote)])
+ if new_ctype <> self.get(header):
+ del self[header]
+ self[header] = new_ctype
+
+ def set_type(self, type, header='Content-Type', requote=1):
+ """Set the main type and subtype for the Content-Type: header.
+
+ type must be a string in the form "maintype/subtype", otherwise a
+ ValueError is raised.
+
+ This method replaces the Content-Type: header, keeping all the
+ parameters in place. If requote is false, this leaves the existing
+ header's quoting as is. Otherwise, the parameters will be quoted (the
+ default).
+
+ An alternate header can be specified in the header argument. When the
+ Content-Type: header is set, we'll always also add a MIME-Version:
+ header.
+ """
+ # BAW: should we be strict?
+ if not type.count('/') == 1:
+ raise ValueError
+ # Set the Content-Type: you get a MIME-Version:
+ if header.lower() == 'content-type':
+ del self['mime-version']
+ self['MIME-Version'] = '1.0'
+ if not self.has_key(header):
+ self[header] = type
+ return
+ params = self.get_params(header, unquote=requote)
+ del self[header]
+ self[header] = type
+ # Skip the first param; it's the old type.
+ for p, v in params[1:]:
+ self.set_param(p, v, header, requote)
+
def get_filename(self, failobj=None):
"""Return the filename associated with the payload if present.
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
index 2f131d6..7177dfc 100644
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -51,9 +51,16 @@ class Parser:
lastvalue = []
lineno = 0
while 1:
- line = fp.readline()[:-1]
- if not line or not line.strip():
+ # Don't strip the line before we test for the end condition,
+ # because whitespace-only header lines are RFC compliant
+ # continuation lines.
+ line = fp.readline()
+ if not line:
break
+ line = line.splitlines()[0]
+ if not line:
+ break
+ # Ignore the trailing newline
lineno += 1
# Check for initial Unix From_ line
if line.startswith('From '):
@@ -63,7 +70,6 @@ class Parser:
else:
raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header')
- #
# Header continuation line
if line[0] in ' \t':
if not lastheader:
@@ -134,11 +140,11 @@ class Parser:
msgobj = self.parsestr(part)
container.preamble = preamble
container.epilogue = epilogue
- # Ensure that the container's payload is a list
- if not isinstance(container.get_payload(), ListType):
- container.set_payload([msgobj])
- else:
- container.add_payload(msgobj)
+ container.attach(msgobj)
+ elif container.get_main_type() == 'multipart':
+ # Very bad. A message is a multipart with no boundary!
+ raise Errors.BoundaryError(
+ 'multipart message with no defined boundary')
elif container.get_type() == 'message/delivery-status':
# This special kind of type contains blocks of headers separated
# by a blank line. We'll represent each header block as a
@@ -160,9 +166,9 @@ class Parser:
except Errors.HeaderParseError:
msg = self._class()
self._parsebody(msg, fp)
- container.add_payload(msg)
+ container.set_payload(msg)
else:
- container.add_payload(fp.read())
+ container.set_payload(fp.read())
diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py
index 3d48287..887be55 100644
--- a/Lib/email/Utils.py
+++ b/Lib/email/Utils.py
@@ -1,16 +1,26 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Miscellaneous utilities.
"""
import time
+import socket
import re
+import random
+import os
+import warnings
+from cStringIO import StringIO
+from types import ListType
-from rfc822 import unquote, quote, parseaddr
-from rfc822 import dump_address_pair
+from rfc822 import unquote, quote
from rfc822 import AddrlistClass as _AddrlistClass
-from rfc822 import parsedate_tz, parsedate, mktime_tz
+from rfc822 import mktime_tz
+
+# We need workarounds for bugs in these methods in older Pythons (see below)
+from rfc822 import parsedate as _parsedate
+from rfc822 import parsedate_tz as _parsedate_tz
+from rfc822 import parseaddr as _parseaddr
from quopri import decodestring as _qdecode
import base64
@@ -20,6 +30,10 @@ from Encoders import _bencode, _qencode
COMMASPACE = ', '
UEMPTYSTRING = u''
+CRLF = '\r\n'
+
+specialsre = re.compile(r'[][\()<>@,:;".]')
+escapesre = re.compile(r'[][\()"]')
@@ -44,6 +58,41 @@ def _bdecode(s):
+def fix_eols(s):
+ """Replace all line-ending characters with \r\n."""
+ # Fix newlines with no preceding carriage return
+ s = re.sub(r'(?<!\r)\n', CRLF, s)
+ # Fix carriage returns with no following newline
+ s = re.sub(r'\r(?!\n)', CRLF, s)
+ return s
+
+
+
+def formataddr(pair):
+ """The inverse of parseaddr(), this takes a 2-tuple of the form
+ (realname, email_address) and returns the string value suitable
+ for an RFC 2822 From:, To: or Cc:.
+
+ If the first element of pair is false, then the second element is
+ returned unmodified.
+ """
+ name, address = pair
+ if name:
+ quotes = ''
+ if specialsre.search(name):
+ quotes = '"'
+ name = escapesre.sub(r'\\\g<0>', name)
+ return '%s%s%s <%s>' % (quotes, name, quotes, address)
+ return address
+
+# For backwards compatibility
+def dump_address_pair(pair):
+ warnings.warn('Use email.Utils.formataddr() instead',
+ DeprecationWarning, 2)
+ return formataddr(pair)
+
+
+
def getaddresses(fieldvalues):
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
all = COMMASPACE.join(fieldvalues)
@@ -64,30 +113,26 @@ ecre = re.compile(r'''
def decode(s):
- """Return a decoded string according to RFC 2047, as a unicode string."""
+ """Return a decoded string according to RFC 2047, as a unicode string.
+
+ NOTE: This function is deprecated. Use Header.decode_header() instead.
+ """
+ warnings.warn('Use Header.decode_header() instead.', DeprecationWarning, 2)
+ # Intra-package import here to avoid circular import problems.
+ from Header import decode_header
+ L = decode_header(s)
+ if not isinstance(L, ListType):
+ # s wasn't decoded
+ return s
+
rtn = []
- parts = ecre.split(s, 1)
- while parts:
- # If there are less than 4 parts, it can't be encoded and we're done
- if len(parts) < 5:
- rtn.extend(parts)
- break
- # The first element is any non-encoded leading text
- rtn.append(parts[0])
- charset = parts[1]
- encoding = parts[2].lower()
- atom = parts[3]
- # The next chunk to decode should be in parts[4]
- parts = ecre.split(parts[4])
- # The encoding must be either `q' or `b', case-insensitive
- if encoding == 'q':
- func = _qdecode
- elif encoding == 'b':
- func = _bdecode
+ for atom, charset in L:
+ if charset is None:
+ rtn.append(atom)
else:
- func = _identity
- # Decode and get the unicode in the charset
- rtn.append(unicode(func(atom), charset))
+ # Convert the string to Unicode using the given encoding. Leave
+ # Unicode conversion errors to strict.
+ rtn.append(unicode(atom, charset))
# Now that we've decoded everything, we just need to join all the parts
# together into the final string.
return UEMPTYSTRING.join(rtn)
@@ -96,6 +141,7 @@ def decode(s):
def encode(s, charset='iso-8859-1', encoding='q'):
"""Encode a string according to RFC 2047."""
+ warnings.warn('Use Header.Header.encode() instead.', DeprecationWarning, 2)
encoding = encoding.lower()
if encoding == 'q':
estr = _qencode(s)
@@ -150,3 +196,48 @@ def formatdate(timeval=None, localtime=0):
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
now[0], now[3], now[4], now[5],
zone)
+
+
+
+def make_msgid(idstring=None):
+ """Returns a string suitable for RFC 2822 compliant Message-ID:, e.g:
+
+ <20020201195627.33539.96671@nightshade.la.mastaler.com>
+
+ Optional idstring if given is a string used to strengthen the
+ uniqueness of the Message-ID, otherwise an empty string is used.
+ """
+ timeval = time.time()
+ utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
+ pid = os.getpid()
+ randint = random.randrange(100000)
+ if idstring is None:
+ idstring = ''
+ else:
+ idstring = '.' + idstring
+ idhost = socket.getfqdn()
+ msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
+ return msgid
+
+
+
+# These functions are in the standalone mimelib version only because they've
+# subsequently been fixed in the latest Python versions. We use this to worm
+# around broken older Pythons.
+def parsedate(data):
+ if not data:
+ return None
+ return _parsedate(data)
+
+
+def parsedate_tz(data):
+ if not data:
+ return None
+ return _parsedate_tz(data)
+
+
+def parseaddr(addr):
+ realname, emailaddr = _parseaddr(addr)
+ if realname == '' and emailaddr is None:
+ return '', ''
+ return realname, emailaddr
diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py
index c13495b..f4a5b76 100644
--- a/Lib/email/__init__.py
+++ b/Lib/email/__init__.py
@@ -1,14 +1,16 @@
-# Copyright (C) 2001 Python Software Foundation
+# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""A package for parsing, handling, and generating email messages.
"""
-__version__ = '1.0'
+__version__ = '2.0'
-__all__ = ['Encoders',
+__all__ = ['Charset',
+ 'Encoders',
'Errors',
'Generator',
+ 'Header',
'Iterators',
'MIMEAudio',
'MIMEBase',
@@ -18,6 +20,8 @@ __all__ = ['Encoders',
'Message',
'Parser',
'Utils',
+ 'base64MIME',
+ 'quopriMIME',
'message_from_string',
'message_from_file',
]
diff --git a/Lib/email/base64MIME.py b/Lib/email/base64MIME.py
new file mode 100644
index 0000000..08420b2
--- /dev/null
+++ b/Lib/email/base64MIME.py
@@ -0,0 +1,174 @@
+# Copyright (C) 2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+"""Base64 content transfer encoding per RFCs 2045-2047.
+
+This module handles the content transfer encoding method defined in RFC 2045
+to encode arbitrary 8-bit data using the three 8-bit bytes in four 7-bit
+characters encoding known as Base64.
+
+It is used in the MIME standards for email to attach images, audio, and text
+using some 8-bit character sets to messages.
+
+This module provides an interface to encode and decode both headers and bodies
+with Base64 encoding.
+
+RFC 2047 defines a method for including character set information in an
+`encoded-word' in a header. This method is commonly used for 8-bit real names
+in To:, From:, Cc:, etc. fields, as well as Subject: lines.
+
+This module does not do the line wrapping or end-of-line character conversion
+necessary for proper internationalized headers; it only does dumb encoding and
+decoding. To deal with the various line wrapping issues, use the email.Header
+module.
+"""
+
+import re
+from binascii import b2a_base64, a2b_base64
+from email.Utils import fix_eols
+
+CRLF = '\r\n'
+NL = '\n'
+EMPTYSTRING = ''
+
+# See also Charset.py
+MISC_LEN = 7
+
+
+
+# Helpers
+def base64_len(s):
+ """Return the length of s when it is encoded with base64."""
+ groups_of_3, leftover = divmod(len(s), 3)
+ # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
+ # Thanks, Tim!
+ n = groups_of_3 * 4
+ if leftover:
+ n += 4
+ return n
+
+
+
+def header_encode(header, charset='iso-8859-1', keep_eols=0, maxlinelen=76,
+ eol=NL):
+ """Encode a single header line with Base64 encoding in a given charset.
+
+ Defined in RFC 2045, this Base64 encoding is identical to normal Base64
+ encoding, except that each line must be intelligently wrapped (respecting
+ the Base64 encoding), and subsequent lines must start with a space.
+
+ charset names the character set to use to encode the header. It defaults
+ to iso-8859-1.
+
+ End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
+ to the canonical email line separator \\r\\n unless the keep_eols
+ parameter is set to true (the default is false).
+
+ Each line of the header will be terminated in the value of eol, which
+ defaults to "\\n". Set this to "\\r\\n" if you are using the result of
+ this function directly in email.
+
+ The resulting string will be in the form:
+
+ "=?charset?b?WW/5ciBtYXp66XLrIHf8eiBhIGhhbXBzdGHuciBBIFlv+XIgbWF6euly?=\\n
+ =?charset?b?6yB3/HogYSBoYW1wc3Rh7nIgQkMgWW/5ciBtYXp66XLrIHf8eiBhIGhh?="
+
+ with each line wrapped at, at most, maxlinelen characters (defaults to 76
+ characters).
+ """
+ # Return empty headers unchanged
+ if not header:
+ return header
+
+ if not keep_eols:
+ header = fix_eols(header)
+
+ # Base64 encode each line, in encoded chunks no greater than maxlinelen in
+ # length, after the RFC chrome is added in.
+ base64ed = []
+ max_encoded = maxlinelen - len(charset) - MISC_LEN
+ max_unencoded = max_encoded * 3 / 4
+
+ # BAW: Ben's original code used a step of max_unencoded, but I think it
+ # ought to be max_encoded. Otherwise, where's max_encoded used? I'm
+ # still not sure what the
+ for i in range(0, len(header), max_unencoded):
+ base64ed.append(b2a_base64(header[i:i+max_unencoded]))
+
+ # Now add the RFC chrome to each encoded chunk
+ lines = []
+ for line in base64ed:
+ # Ignore the last character of each line if it is a newline
+ if line[-1] == NL:
+ line = line[:-1]
+ # Add the chrome
+ lines.append('=?%s?b?%s?=' % (charset, line))
+ # Glue the lines together and return it. BAW: should we be able to
+ # specify the leading whitespace in the joiner?
+ joiner = eol + ' '
+ return joiner.join(lines)
+
+
+
+def encode(s, binary=1, maxlinelen=76, eol=NL):
+ """Encode a string with base64.
+
+ Each line will be wrapped at, at most, maxlinelen characters (defaults to
+ 76 characters).
+
+ If binary is false, end-of-line characters will be converted to the
+ canonical email end-of-line sequence \\r\\n. Otherwise they will be left
+ verbatim (this is the default).
+
+ Each line of encoded text will end with eol, which defaults to "\\n". Set
+ this to "\\r\\n" if you will be using the result of this function directly
+ in an email.
+ """
+ if not s:
+ return s
+
+ if not binary:
+ s = fix_eols(s)
+
+ encvec = []
+ max_unencoded = maxlinelen * 3 / 4
+ for i in range(0, len(s), max_unencoded):
+ # BAW: should encode() inherit b2a_base64()'s dubious behavior in
+ # adding a newline to the encoded string?
+ enc = b2a_base64(s[i:i + max_unencoded])
+ if enc[-1] == NL and eol <> NL:
+ enc = enc[:-1] + eol
+ encvec.append(enc)
+ return EMPTYSTRING.join(encvec)
+
+
+# For convenience and backwards compatibility w/ standard base64 module
+body_encode = encode
+encodestring = encode
+
+
+
+def decode(s, convert_eols=None):
+ """Decode a raw base64 string.
+
+ If convert_eols is set to a string value, all canonical email linefeeds,
+ e.g. "\\r\\n", in the decoded text will be converted to the value of
+ convert_eols. os.linesep is a good choice for convert_eols if you are
+ decoding a text attachment.
+
+ This function does not parse a full MIME header value encoded with
+ base64 (like =?iso-8859-1?b?bmloISBuaWgh?=) -- please use the high
+ level email.Header class for that functionality.
+ """
+ if not s:
+ return s
+
+ dec = a2b_base64(s)
+ if convert_eols:
+ return dec.replace(CRLF, convert_eols)
+ return dec
+
+
+# For convenience and backwards compatibility w/ standard base64 module
+body_decode = decode
+decodestring = decode
diff --git a/Lib/email/quopriMIME.py b/Lib/email/quopriMIME.py
new file mode 100644
index 0000000..002034e
--- /dev/null
+++ b/Lib/email/quopriMIME.py
@@ -0,0 +1,312 @@
+# Copyright (C) 2001,2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+"""Quoted-printable content transfer encoding per RFCs 2045-2047.
+
+This module handles the content transfer encoding method defined in RFC 2045
+to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
+safely encode text that is in a character set similar to the 7-bit US ASCII
+character set, but that includes some 8-bit characters that are normally not
+allowed in email bodies or headers.
+
+Quoted-printable is very space-inefficient for encoding binary files; use the
+email.base64MIME module for that instead.
+
+This module provides an interface to encode and decode both headers and bodies
+with quoted-printable encoding.
+
+RFC 2047 defines a method for including character set information in an
+`encoded-word' in a header. This method is commonly used for 8-bit real names
+in To:/From:/Cc: etc. fields, as well as Subject: lines.
+
+This module does not do the line wrapping or end-of-line character
+conversion necessary for proper internationalized headers; it only
+does dumb encoding and decoding. To deal with the various line
+wrapping issues, use the email.Header module.
+"""
+
+import re
+from string import hexdigits
+from email.Utils import fix_eols
+
+CRLF = '\r\n'
+NL = '\n'
+
+# See also Charset.py
+MISC_LEN = 7
+
+hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
+bqre = re.compile(r'[^ !-<>-~\t]')
+
+
+
+# Helpers
+def header_quopri_check(c):
+ """Return true if the character should be escaped with header quopri."""
+ return hqre.match(c) and 1
+
+
+def body_quopri_check(c):
+ """Return true if the character should be escaped with body quopri."""
+ return bqre.match(c) and 1
+
+
+def header_quopri_len(s):
+ """Return the length of s when it is encoded with header quopri."""
+ count = 0
+ for c in s:
+ if hqre.match(c):
+ count += 3
+ else:
+ count += 1
+ return count
+
+
+def body_quopri_len(str):
+ """Return the length of str when it is encoded with body quopri."""
+ count = 0
+ for c in str:
+ if bqre.match(c):
+ count += 3
+ else:
+ count += 1
+ return count
+
+
+def _max_append(L, s, maxlen, extra=''):
+ if not L:
+ L.append(s)
+ elif len(L[-1]) + len(s) < maxlen:
+ L[-1] += extra + s
+ else:
+ L.append(s)
+
+
+def unquote(s):
+ """Turn a string in the form =AB to the ASCII character with value 0xab"""
+ return chr(int(s[1:3], 16))
+
+
+def quote(c):
+ return "=%02X" % ord(c)
+
+
+
+def header_encode(header, charset="iso-8859-1", keep_eols=0, maxlinelen=76,
+ eol=NL):
+ """Encode a single header line with quoted-printable (like) encoding.
+
+ Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
+ used specifically for email header fields to allow charsets with mostly 7
+ bit characters (and some 8 bit) to remain more or less readable in non-RFC
+ 2045 aware mail clients.
+
+ charset names the character set to use to encode the header. It defaults
+ to iso-8859-1.
+
+ The resulting string will be in the form:
+
+ "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
+ =?charset?q?Silly_=C8nglish_Kn=EEghts?="
+
+ with each line wrapped safely at, at most, maxlinelen characters (defaults
+ to 76 characters).
+
+ End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
+ to the canonical email line separator \\r\\n unless the keep_eols
+ parameter is set to true (the default is false).
+
+ Each line of the header will be terminated in the value of eol, which
+ defaults to "\\n". Set this to "\\r\\n" if you are using the result of
+ this function directly in email.
+ """
+ # Return empty headers unchanged
+ if not header:
+ return header
+
+ if not keep_eols:
+ header = fix_eols(header)
+
+ # Quopri encode each line, in encoded chunks no greater than maxlinelen in
+ # length, after the RFC chrome is added in.
+ quoted = []
+ max_encoded = maxlinelen - len(charset) - MISC_LEN
+
+ for c in header:
+ # Space may be represented as _ instead of =20 for readability
+ if c == ' ':
+ _max_append(quoted, '_', max_encoded)
+ # These characters can be included verbatim
+ elif not hqre.match(c):
+ _max_append(quoted, c, max_encoded)
+ # Otherwise, replace with hex value like =E2
+ else:
+ _max_append(quoted, "=%02X" % ord(c), max_encoded)
+
+ # Now add the RFC chrome to each encoded chunk and glue the chunks
+ # together. BAW: should we be able to specify the leading whitespace in
+ # the joiner?
+ joiner = eol + ' '
+ return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])
+
+
+
+def encode(body, binary=0, maxlinelen=76, eol=NL):
+ """Encode with quoted-printable, wrapping at maxlinelen characters.
+
+ If binary is false (the default), end-of-line characters will be converted
+ to the canonical email end-of-line sequence \\r\\n. Otherwise they will
+ be left verbatim.
+
+ Each line of encoded text will end with eol, which defaults to "\\n". Set
+ this to "\\r\\n" if you will be using the result of this function directly
+ in an email.
+
+ Each line will be wrapped at, at most, maxlinelen characters (defaults to
+ 76 characters). Long lines will have the `soft linefeed' quoted-printable
+ character "=" appended to them, so the decoded text will be identical to
+ the original text.
+ """
+ if not body:
+ return body
+
+ if not binary:
+ body = fix_eols(body)
+
+ # BAW: We're accumulating the body text by string concatenation. That
+ # can't be very efficient, but I don't have time now to rewrite it. It
+ # just feels like this algorithm could be more efficient.
+ encoded_body = ''
+ lineno = -1
+ # Preserve line endings here so we can check later to see if an eol needs
+ # to be added to the output.
+ lines = body.splitlines(1)
+ for line in lines:
+ # But strip off line-endings for processing this line.
+ if line.endswith(CRLF):
+ line = line[:-2]
+ elif line[-1] in CRLF:
+ line = line[:-1]
+
+ lineno += 1
+ encoded_line = ''
+ prev = None
+ linelen = len(line)
+ # Now we need to examine every character to see if it needs to be
+ # quopri encoded. BAW: again, string concatenation is inefficient.
+ for j in range(linelen):
+ c = line[j]
+ prev = c
+ if bqre.match(c):
+ c = quote(c)
+ elif j+1 == linelen:
+ # Check for whitespace at end of line; special case
+ if c not in ' \t':
+ encoded_line += c
+ prev = c
+ continue
+ # Check to see if the line has reached its maximum length
+ if len(encoded_line) + len(c) >= maxlinelen:
+ encoded_body += encoded_line + '=' + eol
+ encoded_line = ''
+ encoded_line += c
+ # Now at end of line..
+ if prev and prev in ' \t':
+ # Special case for whitespace at end of file
+ if lineno+1 == len(lines):
+ prev = quote(prev)
+ if len(encoded_line) + len(prev) > maxlinelen:
+ encoded_body += encoded_line + '=' + eol + prev
+ else:
+ encoded_body += encoded_line + prev
+ # Just normal whitespace at end of line
+ else:
+ encoded_body += encoded_line + prev + '=' + eol
+ encoded_line = ''
+ # Now look at the line we just finished; if it has a line ending, we
+ # need to add eol to the end of the line.
+ if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF:
+ encoded_body += encoded_line + eol
+ else:
+ encoded_body += encoded_line
+ encoded_line = ''
+ return encoded_body
+
+
+# For convenience and backwards compatibility w/ standard quopri module
+body_encode = encode
+encodestring = encode
+
+
+
+# BAW: I'm not sure if the intent was for the signature of this function to be
+# the same as base64MIME.decode() or not...
+def decode(encoded, eol=NL):
+ """Decode a quoted-printable string.
+
+ Lines are separated with eol, which defaults to \\n.
+ """
+ if not encoded:
+ return encoded
+ # BAW: see comment in encode() above. Again, we're building up the
+ # decoded string with string concatenation, which could be done much more
+ # efficiently.
+ decoded = ''
+
+ for line in encoded.splitlines():
+ line = line.rstrip()
+ if not line:
+ decoded += eol
+ continue
+
+ i = 0
+ n = len(line)
+ while i < n:
+ c = line[i]
+ if c <> '=':
+ decoded += c
+ i += 1
+ # Otherwise, c == "=". Are we at the end of the line? If so, add
+ # a soft line break.
+ elif i+1 == n:
+ i += 1
+ continue
+ # Decode if in form =AB
+ elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
+ decoded += unquote(line[i:i+3])
+ i += 3
+ # Otherwise, not in form =AB, pass literally
+ else:
+ decoded += c
+ i += 1
+
+ if i == n:
+ decoded += eol
+ # Special case if original string did not end with eol
+ if encoded[-1] <> eol and decoded[-1] == eol:
+ decoded = decoded[:-1]
+ return decoded
+
+
+# For convenience and backwards compatibility w/ standard quopri module
+body_decode = decode
+decodestring = decode
+
+
+
+def _unquote_match(match):
+ """Turn a match in the form =AB to the ASCII character with value 0xab"""
+ s = match.group(0)
+ return unquote(s)
+
+
+# Header decoding is done a bit differently
+def header_decode(s):
+ """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
+
+ This function does not parse a full MIME header value encoded with
+ quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
+ the high level email.Header class for that functionality.
+ """
+ s = s.replace('_', ' ')
+ return re.sub(r'=\w{2}', _unquote_match, s)