summaryrefslogtreecommitdiffstats
path: root/Lib/email
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/__init__.py19
-rw-r--r--Lib/email/_parseaddr.py20
-rw-r--r--Lib/email/base64mime.py8
-rw-r--r--Lib/email/charset.py33
-rw-r--r--Lib/email/encoders.py9
-rw-r--r--Lib/email/feedparser.py11
-rw-r--r--Lib/email/generator.py215
-rw-r--r--Lib/email/header.py35
-rw-r--r--Lib/email/message.py115
-rw-r--r--Lib/email/parser.py47
-rw-r--r--Lib/email/quoprimime.py6
-rw-r--r--Lib/email/test/data/msg_10.txt7
-rw-r--r--Lib/email/test/data/msg_26.txt3
-rw-r--r--Lib/email/test/test_email.py581
-rw-r--r--Lib/email/test/test_email_codecs.py46
-rw-r--r--Lib/email/test/test_email_torture.py6
-rw-r--r--Lib/email/utils.py11
17 files changed, 1015 insertions, 157 deletions
diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py
index 8702212..bd316fd 100644
--- a/Lib/email/__init__.py
+++ b/Lib/email/__init__.py
@@ -4,7 +4,7 @@
"""A package for parsing, handling, and generating email messages."""
-__version__ = '5.0.0'
+__version__ = '5.1.0'
__all__ = [
'base64mime',
@@ -16,7 +16,9 @@ __all__ = [
'iterators',
'message',
'message_from_file',
+ 'message_from_binary_file',
'message_from_string',
+ 'message_from_bytes',
'mime',
'parser',
'quoprimime',
@@ -36,6 +38,13 @@ def message_from_string(s, *args, **kws):
from email.parser import Parser
return Parser(*args, **kws).parsestr(s)
+def message_from_bytes(s, *args, **kws):
+ """Parse a bytes string into a Message object model.
+
+ Optional _class and strict are passed to the Parser constructor.
+ """
+ from email.parser import BytesParser
+ return BytesParser(*args, **kws).parsebytes(s)
def message_from_file(fp, *args, **kws):
"""Read a file and parse its contents into a Message object model.
@@ -44,3 +53,11 @@ def message_from_file(fp, *args, **kws):
"""
from email.parser import Parser
return Parser(*args, **kws).parse(fp)
+
+def message_from_binary_file(fp, *args, **kws):
+ """Read a binary file and parse its contents into a Message object model.
+
+ Optional _class and strict are passed to the Parser constructor.
+ """
+ from email.parser import BytesParser
+ return BytesParser(*args, **kws).parse(fp)
diff --git a/Lib/email/_parseaddr.py b/Lib/email/_parseaddr.py
index 3bd4ba4..41694f9 100644
--- a/Lib/email/_parseaddr.py
+++ b/Lib/email/_parseaddr.py
@@ -64,8 +64,10 @@ def parsedate_tz(data):
if len(data) == 4:
s = data[3]
i = s.find('+')
+ if i == -1:
+ i = s.find('-')
if i > 0:
- data[3:] = [s[:i], s[i+1:]]
+ data[3:] = [s[:i], s[i:]]
else:
data.append('') # Dummy tz
if len(data) < 5:
@@ -199,14 +201,18 @@ class AddrlistClass:
self.commentlist = []
def gotonext(self):
- """Parse up to the start of the next address."""
+ """Skip white space and extract comments."""
+ wslist = []
while self.pos < len(self.field):
if self.field[self.pos] in self.LWS + '\n\r':
+ if self.field[self.pos] not in '\n\r':
+ wslist.append(self.field[self.pos])
self.pos += 1
elif self.field[self.pos] == '(':
self.commentlist.append(self.getcomment())
else:
break
+ return EMPTYSTRING.join(wslist)
def getaddrlist(self):
"""Parse all addresses.
@@ -319,16 +325,24 @@ class AddrlistClass:
self.gotonext()
while self.pos < len(self.field):
+ preserve_ws = True
if self.field[self.pos] == '.':
+ if aslist and not aslist[-1].strip():
+ aslist.pop()
aslist.append('.')
self.pos += 1
+ preserve_ws = False
elif self.field[self.pos] == '"':
aslist.append('"%s"' % quote(self.getquote()))
elif self.field[self.pos] in self.atomends:
+ if aslist and not aslist[-1].strip():
+ aslist.pop()
break
else:
aslist.append(self.getatom())
- self.gotonext()
+ ws = self.gotonext()
+ if preserve_ws and ws:
+ aslist.append(ws)
if self.pos >= len(self.field) or self.field[self.pos] != '@':
return EMPTYSTRING.join(aslist)
diff --git a/Lib/email/base64mime.py b/Lib/email/base64mime.py
index 28e2542..f3bbac1 100644
--- a/Lib/email/base64mime.py
+++ b/Lib/email/base64mime.py
@@ -20,7 +20,7 @@ in To:, From:, Cc:, etc. fields, as well as Subject: lines.
This module does not do the line wrapping or end-of-line character conversion
necessary for proper internationalized headers; it only does dumb encoding and
-decoding. To deal with the various line wrapping issues, use the email.Header
+decoding. To deal with the various line wrapping issues, use the email.header
module.
"""
@@ -74,12 +74,12 @@ def header_encode(header_bytes, charset='iso-8859-1'):
def body_encode(s, maxlinelen=76, eol=NL):
- """Encode a string with base64.
+ r"""Encode a string with base64.
Each line will be wrapped at, at most, maxlinelen characters (defaults to
76 characters).
- Each line of encoded text will end with eol, which defaults to "\\n". Set
+ Each line of encoded text will end with eol, which defaults to "\n". Set
this to "\r\n" if you will be using the result of this function directly
in an email.
"""
@@ -104,7 +104,7 @@ def decode(string):
This function does not parse a full MIME header value encoded with
base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
- level email.Header class for that functionality.
+ level email.header class for that functionality.
"""
if not string:
return bytes()
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..f22be2c 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (i.e. have surrogate-escaped bytes)
+def _encode(string, codec):
+ if codec == UNKNOWN8BIT:
+ return string.encode('ascii', 'surrogateescape')
+ else:
+ return string.encode(codec)
+
+
+
class Charset:
"""Map character sets to their email properties.
@@ -252,7 +263,7 @@ class Charset:
Returns "quoted-printable" if self.body_encoding is QP.
Returns "base64" if self.body_encoding is BASE64.
- Returns "7bit" otherwise.
+ Returns conversion function otherwise.
"""
assert self.body_encoding != SHORTEST
if self.body_encoding == QP:
@@ -282,8 +293,7 @@ class Charset:
:return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
- charset = self.get_output_charset()
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
encoder_module = self._get_encoder(header_bytes)
if encoder_module is None:
@@ -309,9 +319,9 @@ class Charset:
"""
# See which encoding we should use.
codec = self.output_codec or 'us-ascii'
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
encoder_module = self._get_encoder(header_bytes)
- encoder = partial(encoder_module.header_encode, charset=str(self))
+ encoder = partial(encoder_module.header_encode, charset=codec)
# Calculate the number of characters that the RFC 2047 chrome will
# contribute to each line.
charset = self.get_output_charset()
@@ -333,7 +343,7 @@ class Charset:
for character in string:
current_line.append(character)
this_line = EMPTYSTRING.join(current_line)
- length = encoder_module.header_length(this_line.encode(charset))
+ length = encoder_module.header_length(_encode(this_line, charset))
if length > maxlen:
# This last character doesn't fit so pop it off.
current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
else:
separator = (' ' if lines else '')
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
current_line = [character]
maxlen = next(maxlengths) - extra
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
return lines
@@ -371,7 +381,10 @@ class Charset:
"""Body-encode a string by converting it first to bytes.
The type of encoding (base64 or quoted-printable) will be based on
- self.body_encoding.
+ self.body_encoding. If body_encoding is None, we assume the
+ output charset is a 7bit encoding, so re-encoding the decoded
+ string using the ascii codec produces the correct string version
+ of the content.
"""
# 7bit/8bit encodings return the string unchanged (module conversions)
if self.body_encoding is BASE64:
@@ -381,4 +394,6 @@ class Charset:
elif self.body_encoding is QP:
return email.quoprimime.body_encode(string)
else:
+ if isinstance(string, str):
+ string = string.encode(self.output_charset).decode('ascii')
return string
diff --git a/Lib/email/encoders.py b/Lib/email/encoders.py
index dfaac58..e5c099f 100644
--- a/Lib/email/encoders.py
+++ b/Lib/email/encoders.py
@@ -54,10 +54,13 @@ def encode_7or8bit(msg):
# There's no payload. For backwards compatibility we use 7bit
msg['Content-Transfer-Encoding'] = '7bit'
return
- # We play a trick to make this go fast. If encoding to ASCII succeeds, we
- # know the data must be 7bit, otherwise treat it as 8bit.
+ # We play a trick to make this go fast. If encoding/decode to ASCII
+ # succeeds, we know the data must be 7bit, otherwise treat it as 8bit.
try:
- orig.encode('ascii')
+ if isinstance(orig, str):
+ orig.encode('ascii')
+ else:
+ orig.decode('ascii')
except UnicodeError:
# iso-2022-* is non-ASCII but still 7-bit
charset = msg.get_charset()
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 8db70b3..60a8325 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -368,12 +368,12 @@ class FeedParser:
end = len(mo.group(0))
self._last.epilogue = epilogue[:-end]
else:
- payload = self._last.get_payload()
+ payload = self._last._payload
if isinstance(payload, str):
mo = NLCRE_eol.search(payload)
if mo:
payload = payload[:-len(mo.group(0))]
- self._last.set_payload(payload)
+ self._last._payload = payload
self._input.pop_eof_matcher()
self._pop_message()
# Set the multipart up for newline cleansing, which will
@@ -482,3 +482,10 @@ class FeedParser:
if lastheader:
# XXX reconsider the joining of folded lines
self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
+
+
+class BytesFeedParser(FeedParser):
+ """Like FeedParser, but feed accepts bytes."""
+
+ def feed(self, data):
+ super().feed(data.decode('ascii', 'surrogateescape'))
diff --git a/Lib/email/generator.py b/Lib/email/generator.py
index cc30aff..f0e7a95 100644
--- a/Lib/email/generator.py
+++ b/Lib/email/generator.py
@@ -12,11 +12,12 @@ import time
import random
import warnings
-from io import StringIO
+from io import StringIO, BytesIO
from email.header import Header
+from email.message import _has_surrogates
UNDERSCORE = '_'
-NL = '\n'
+NL = '\n' # XXX: no longer used by the code below.
fcre = re.compile(r'^From ', re.MULTILINE)
@@ -57,8 +58,8 @@ class Generator:
# Just delegate to the file object
self._fp.write(s)
- def flatten(self, msg, unixfrom=False):
- """Print the message object tree rooted at msg to the output file
+ def flatten(self, msg, unixfrom=False, linesep='\n'):
+ r"""Print the message object tree rooted at msg to the output file
specified when the Generator instance was created.
unixfrom is a flag that forces the printing of a Unix From_ delimiter
@@ -67,12 +68,26 @@ class Generator:
is False to inhibit the printing of any From_ delimiter.
Note that for subobjects, no From_ line is printed.
+
+ linesep specifies the characters used to indicate a new line in
+ the output. The default value is the most useful for typical
+ Python applications, but it can be set to \r\n to produce RFC-compliant
+ line separators when needed.
+
"""
+ # We use the _XXX constants for operating on data that comes directly
+ # from the msg, and _encoded_XXX constants for operating on data that
+ # has already been converted (to bytes in the BytesGenerator) and
+ # inserted into a temporary buffer.
+ self._NL = linesep
+ self._encoded_NL = self._encode(linesep)
+ self._EMPTY = ''
+ self._encoded_EMTPY = self._encode('')
if unixfrom:
ufrom = msg.get_unixfrom()
if not ufrom:
ufrom = 'From nobody ' + time.ctime(time.time())
- print(ufrom, file=self._fp)
+ self.write(ufrom + self._NL)
self._write(msg)
def clone(self, fp):
@@ -83,6 +98,27 @@ class Generator:
# Protected interface - undocumented ;/
#
+ # Note that we use 'self.write' when what we are writing is coming from
+ # the source, and self._fp.write when what we are writing is coming from a
+ # buffer (because the Bytes subclass has already had a chance to transform
+ # the data in its write method in that case). This is an entirely
+ # pragmatic split determined by experiment; we could be more general by
+ # always using write and having the Bytes subclass write method detect when
+ # it has already transformed the input; but, since this whole thing is a
+ # hack anyway this seems good enough.
+
+ # Similarly, we have _XXX and _encoded_XXX attributes that are used on
+ # source and buffer data, respectively.
+ _encoded_EMPTY = ''
+
+ def _new_buffer(self):
+ # BytesGenerator overrides this to return BytesIO.
+ return StringIO()
+
+ def _encode(self, s):
+ # BytesGenerator overrides this to encode strings to bytes.
+ return s
+
def _write(self, msg):
# We can't write the headers yet because of the following scenario:
# say a multipart message includes the boundary string somewhere in
@@ -91,13 +127,13 @@ class Generator:
# parameter.
#
# The way we do this, so as to make the _handle_*() methods simpler,
- # is to cache any subpart writes into a StringIO. The we write the
- # headers and the StringIO contents. That way, subpart handlers can
+ # is to cache any subpart writes into a buffer. Then we write the
+ # headers and the buffer contents. That way, subpart handlers can
# Do The Right Thing, and can still modify the Content-Type: header if
# necessary.
oldfp = self._fp
try:
- self._fp = sfp = StringIO()
+ self._fp = sfp = self._new_buffer()
self._dispatch(msg)
finally:
self._fp = oldfp
@@ -132,16 +168,17 @@ class Generator:
def _write_headers(self, msg):
for h, v in msg.items():
- print('%s:' % h, end=' ', file=self._fp)
+ self.write('%s: ' % h)
if isinstance(v, Header):
- print(v.encode(maxlinelen=self._maxheaderlen), file=self._fp)
+ self.write(v.encode(
+ maxlinelen=self._maxheaderlen, linesep=self._NL)+self._NL)
else:
# Header's got lots of smarts, so use it.
header = Header(v, maxlinelen=self._maxheaderlen,
header_name=h)
- print(header.encode(), file=self._fp)
+ self.write(header.encode(linesep=self._NL)+self._NL)
# A blank line always separates headers from body
- print(file=self._fp)
+ self.write(self._NL)
#
# Handlers for writing types and subtypes
@@ -153,9 +190,15 @@ class Generator:
return
if not isinstance(payload, str):
raise TypeError('string payload expected: %s' % type(payload))
+ if _has_surrogates(msg._payload):
+ charset = msg.get_param('charset')
+ if charset is not None:
+ del msg['content-transfer-encoding']
+ msg.set_payload(payload, charset)
+ payload = msg.get_payload()
if self._mangle_from_:
payload = fcre.sub('>From ', payload)
- self._fp.write(payload)
+ self.write(payload)
# Default body handler
_writeBody = _handle_text
@@ -170,29 +213,29 @@ class Generator:
subparts = []
elif isinstance(subparts, str):
# e.g. a non-strict parse of a message with no starting boundary.
- self._fp.write(subparts)
+ self.write(subparts)
return
elif not isinstance(subparts, list):
# Scalar payload
subparts = [subparts]
for part in subparts:
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
- g.flatten(part, unixfrom=False)
+ g.flatten(part, unixfrom=False, linesep=self._NL)
msgtexts.append(s.getvalue())
# BAW: What about boundaries that are wrapped in double-quotes?
boundary = msg.get_boundary()
if not boundary:
# Create a boundary that doesn't appear in any of the
# message texts.
- alltext = NL.join(msgtexts)
- boundary = _make_boundary(alltext)
+ alltext = self._encoded_NL.join(msgtexts)
+ boundary = self._make_boundary(alltext)
msg.set_boundary(boundary)
# If there's a preamble, write it out, with a trailing CRLF
if msg.preamble is not None:
- print(msg.preamble, file=self._fp)
+ self.write(msg.preamble + self._NL)
# dash-boundary transport-padding CRLF
- print('--' + boundary, file=self._fp)
+ self.write('--' + boundary + self._NL)
# body-part
if msgtexts:
self._fp.write(msgtexts.pop(0))
@@ -201,14 +244,14 @@ class Generator:
# --> CRLF body-part
for body_part in msgtexts:
# delimiter transport-padding CRLF
- print('\n--' + boundary, file=self._fp)
+ self.write(self._NL + '--' + boundary + self._NL)
# body-part
self._fp.write(body_part)
# close-delimiter transport-padding
- self._fp.write('\n--' + boundary + '--')
+ self.write(self._NL + '--' + boundary + '--')
if msg.epilogue is not None:
- print(file=self._fp)
- self._fp.write(msg.epilogue)
+ self.write(self._NL)
+ self.write(msg.epilogue)
def _handle_multipart_signed(self, msg):
# The contents of signed parts has to stay unmodified in order to keep
@@ -227,23 +270,23 @@ class Generator:
# block and the boundary. Sigh.
blocks = []
for part in msg.get_payload():
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
- g.flatten(part, unixfrom=False)
+ g.flatten(part, unixfrom=False, linesep=self._NL)
text = s.getvalue()
- lines = text.split('\n')
+ lines = text.split(self._encoded_NL)
# Strip off the unnecessary trailing empty line
- if lines and lines[-1] == '':
- blocks.append(NL.join(lines[:-1]))
+ if lines and lines[-1] == self._encoded_EMPTY:
+ blocks.append(self._encoded_NL.join(lines[:-1]))
else:
blocks.append(text)
# Now join all the blocks with an empty line. This has the lovely
# effect of separating each block with an empty line, but not adding
# an extra one after the last one.
- self._fp.write(NL.join(blocks))
+ self._fp.write(self._encoded_NL.join(blocks))
def _handle_message(self, msg):
- s = StringIO()
+ s = self._new_buffer()
g = self.clone(s)
# The payload of a message/rfc822 part should be a multipart sequence
# of length 1. The zeroth element of the list should be the Message
@@ -256,10 +299,98 @@ class Generator:
# in that case we just emit the string body.
payload = msg.get_payload()
if isinstance(payload, list):
- g.flatten(msg.get_payload(0), unixfrom=False)
+ g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL)
payload = s.getvalue()
self._fp.write(payload)
+ # This used to be a module level function; we use a classmethod for this
+ # and _compile_re so we can continue to provide the module level function
+ # for backward compatibility by doing
+ # _make_boundary = Generator._make_boundary
+ # at the end of the module. It *is* internal, so we could drop that...
+ @classmethod
+ def _make_boundary(cls, text=None):
+ # Craft a random boundary. If text is given, ensure that the chosen
+ # boundary doesn't appear in the text.
+ token = random.randrange(sys.maxsize)
+ boundary = ('=' * 15) + (_fmt % token) + '=='
+ if text is None:
+ return boundary
+ b = boundary
+ counter = 0
+ while True:
+ cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
+ if not cre.search(text):
+ break
+ b = boundary + '.' + str(counter)
+ counter += 1
+ return b
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s, flags)
+
+
+class BytesGenerator(Generator):
+ """Generates a bytes version of a Message object tree.
+
+ Functionally identical to the base Generator except that the output is
+ bytes and not string. When surrogates were used in the input to encode
+ bytes, these are decoded back to bytes for output.
+
+ The outfp object must accept bytes in its write method.
+ """
+
+ # Bytes versions of this constant for use in manipulating data from
+ # the BytesIO buffer.
+ _encoded_EMPTY = b''
+
+ def write(self, s):
+ self._fp.write(s.encode('ascii', 'surrogateescape'))
+
+ def _new_buffer(self):
+ return BytesIO()
+
+ def _encode(self, s):
+ return s.encode('ascii')
+
+ def _write_headers(self, msg):
+ # This is almost the same as the string version, except for handling
+ # strings with 8bit bytes.
+ for h, v in msg._headers:
+ self.write('%s: ' % h)
+ if isinstance(v, Header):
+ self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
+ elif _has_surrogates(v):
+ # If we have raw 8bit data in a byte string, we have no idea
+ # what the encoding is. There is no safe way to split this
+ # string. If it's ascii-subset, then we could do a normal
+ # ascii split, but if it's multibyte then we could break the
+ # string. There's no way to know so the least harm seems to
+ # be to not split the string and risk it being too long.
+ self.write(v+NL)
+ else:
+ # Header's got lots of smarts and this string is safe...
+ header = Header(v, maxlinelen=self._maxheaderlen,
+ header_name=h)
+ self.write(header.encode(linesep=self._NL)+self._NL)
+ # A blank line always separates headers from body
+ self.write(self._NL)
+
+ def _handle_text(self, msg):
+ # If the string has surrogates the original source was bytes, so
+ # just write it back out.
+ if msg._payload is None:
+ return
+ if _has_surrogates(msg._payload):
+ self.write(msg._payload)
+ else:
+ super(BytesGenerator,self)._handle_text(msg)
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s.encode('ascii'), flags)
+
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
@@ -320,23 +451,9 @@ class DecodedGenerator(Generator):
-# Helper
+# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
-def _make_boundary(text=None):
- # Craft a random boundary. If text is given, ensure that the chosen
- # boundary doesn't appear in the text.
- token = random.randrange(sys.maxsize)
- boundary = ('=' * 15) + (_fmt % token) + '=='
- if text is None:
- return boundary
- b = boundary
- counter = 0
- while True:
- cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
- if not cre.search(text):
- break
- b = boundary + '.' + str(counter)
- counter += 1
- return b
+# Backward compatibility
+_make_boundary = Generator._make_boundary
diff --git a/Lib/email/header.py b/Lib/email/header.py
index da739d5..2562b30 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -17,7 +17,8 @@ import email.quoprimime
import email.base64mime
from email.errors import HeaderParseError
-from email.charset import Charset
+from email import charset as _charset
+Charset = _charset.Charset
NL = '\n'
SPACE = ' '
@@ -65,9 +66,15 @@ def decode_header(header):
otherwise a lower-case string containing the name of the character set
specified in the encoded string.
- An email.Errors.HeaderParseError may be raised when certain decoding error
+ header may be a string that may or may not contain RFC2047 encoded words,
+ or it may be a Header object.
+
+ An email.errors.HeaderParseError may be raised when certain decoding error
occurs (e.g. a base64 decoding exception).
"""
+ # If it is a Header object, we can just return the chunks.
+ if hasattr(header, '_chunks'):
+ return list(header._chunks)
# If no encoding, just return the header with no charset.
if not ecre.search(header):
return [(header, None)]
@@ -214,6 +221,9 @@ class Header:
# from a charset to None/us-ascii, or from None/us-ascii to a
# charset. Only do this for the second and subsequent chunks.
nextcs = charset
+ if nextcs == _charset.UNKNOWN8BIT:
+ original_bytes = string.encode('ascii', 'surrogateescape')
+ string = original_bytes.decode('ascii', 'replace')
if uchunks:
if lastcs not in (None, 'us-ascii'):
if nextcs in (None, 'us-ascii'):
@@ -267,11 +277,12 @@ class Header:
# Ensure that the bytes we're storing can be decoded to the output
# character set, otherwise an early error is thrown.
output_charset = charset.output_codec or 'us-ascii'
- s.encode(output_charset, errors)
+ if output_charset != _charset.UNKNOWN8BIT:
+ s.encode(output_charset, errors)
self._chunks.append((s, charset))
- def encode(self, splitchars=';, \t', maxlinelen=None):
- """Encode a message header into an RFC-compliant format.
+ def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
+ r"""Encode a message header into an RFC-compliant format.
There are many issues involved in converting a given string for use in
an email header. Only certain character sets are readable in most
@@ -291,6 +302,11 @@ class Header:
Optional splitchars is a string containing characters to split long
ASCII lines on, in rough support of RFC 2822's `highest level
syntactic breaks'. This doesn't affect RFC 2047 encoded lines.
+
+ Optional linesep is a string to be used to separate the lines of
+ the value. The default value is the most useful for typical
+ Python applications, but it can be set to \r\n to produce RFC-compliant
+ line separators when needed.
"""
self._normalize()
if maxlinelen is None:
@@ -314,7 +330,7 @@ class Header:
if len(lines) > 1:
formatter.newline()
formatter.add_transition()
- value = str(formatter)
+ value = formatter._str(linesep)
if _embeded_header.search(value):
raise HeaderParseError("header value appears to contain "
"an embedded header: {!r}".format(value))
@@ -349,9 +365,12 @@ class _ValueFormatter:
self._lines = []
self._current_line = _Accumulator(headerlen)
- def __str__(self):
+ def _str(self, linesep):
self.newline()
- return NL.join(self._lines)
+ return linesep.join(self._lines)
+
+ def __str__(self):
+ return self._str(NL)
def newline(self):
end_of_line = self._current_line.pop()
diff --git a/Lib/email/message.py b/Lib/email/message.py
index d30f109..922617a 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -16,7 +16,9 @@ from io import BytesIO, StringIO
# Intrapackage imports
from email import utils
from email import errors
-from email.charset import Charset
+from email import header
+from email import charset as _charset
+Charset = _charset.Charset
SEMISPACE = '; '
@@ -24,14 +26,31 @@ SEMISPACE = '; '
# existence of which force quoting of the parameter value.
tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
+# How to figure out if we are processing strings that come from a byte
+# source with undecodable characters.
+_has_surrogates = re.compile(
+ '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+
# Helper functions
+def _sanitize_header(name, value):
+ # If the header value contains surrogates, return a Header using
+ # the unknown-8bit charset to encode the bytes as encoded words.
+ if not isinstance(value, str):
+ # Assume it is already a header object
+ return value
+ if _has_surrogates(value):
+ return header.Header(value, charset=_charset.UNKNOWN8BIT,
+ header_name=name)
+ else:
+ return value
+
def _splitparam(param):
# Split header parameters. BAW: this may be too simple. It isn't
# strictly RFC 2045 (section 5.1) compliant, but it catches most headers
- # found in the wild. We may eventually need a full fledged parser
- # eventually.
- a, sep, b = param.partition(';')
+ # found in the wild. We may eventually need a full fledged parser.
+ # RDM: we might have a Header here; for now just stringify it.
+ a, sep, b = str(param).partition(';')
if not sep:
return a.strip(), None
return a.strip(), b.strip()
@@ -48,17 +67,19 @@ def _formatparam(param, value=None, quote=True):
if value is not None and len(value) > 0:
# A tuple is used for RFC 2231 encoded parameter values where items
# are (charset, language, value). charset is a string, not a Charset
- # instance.
+ # instance. RFC 2231 encoded values are never quoted, per RFC.
if isinstance(value, tuple):
# Encode as per RFC 2231
param += '*'
value = utils.encode_rfc2231(value[2], value[0], value[1])
+ return '%s=%s' % (param, value)
else:
try:
value.encode('ascii')
except UnicodeEncodeError:
param += '*'
value = utils.encode_rfc2231(value, 'utf-8', '')
+ return '%s=%s' % (param, value)
# BAW: Please check this. I think that if quote is set it should
# force quoting even if not necessary.
if quote or tspecials.search(value):
@@ -69,6 +90,8 @@ def _formatparam(param, value=None, quote=True):
return param
def _parseparam(s):
+ # RDM This might be a Header, so for now stringify it.
+ s = ';' + str(s)
plist = []
while s[:1] == ';':
s = s[1:]
@@ -193,43 +216,73 @@ class Message:
If the message is a multipart and the decode flag is True, then None
is returned.
"""
- if i is None:
- payload = self._payload
- elif not isinstance(self._payload, list):
+ # Here is the logic table for this code, based on the email5.0.0 code:
+ # i decode is_multipart result
+ # ------ ------ ------------ ------------------------------
+ # None True True None
+ # i True True None
+ # None False True _payload (a list)
+ # i False True _payload element i (a Message)
+ # i False False error (not a list)
+ # i True False error (not a list)
+ # None False False _payload
+ # None True False _payload decoded (bytes)
+ # Note that Barry planned to factor out the 'decode' case, but that
+ # isn't so easy now that we handle the 8 bit data, which needs to be
+ # converted in both the decode and non-decode path.
+ if self.is_multipart():
+ if decode:
+ return None
+ if i is None:
+ return self._payload
+ else:
+ return self._payload[i]
+ # For backward compatibility, use isinstance and this error message
+ # instead of the more logical is_multipart test.
+ if i is not None and not isinstance(self._payload, list):
raise TypeError('Expected list, got %s' % type(self._payload))
- else:
- payload = self._payload[i]
+ payload = self._payload
+ # cte might be a Header, so for now stringify it.
+ cte = str(self.get('content-transfer-encoding', '')).lower()
+ # payload may be bytes here.
+ if isinstance(payload, str):
+ if _has_surrogates(payload):
+ bpayload = payload.encode('ascii', 'surrogateescape')
+ if not decode:
+ try:
+ payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+ except LookupError:
+ payload = bpayload.decode('ascii', 'replace')
+ elif decode:
+ try:
+ bpayload = payload.encode('ascii')
+ except UnicodeError:
+ # This won't happen for RFC compliant messages (messages
+ # containing only ASCII codepoints in the unicode input).
+ # If it does happen, turn the string into bytes in a way
+ # guaranteed not to fail.
+ bpayload = payload.encode('raw-unicode-escape')
if not decode:
return payload
- # Decoded payloads always return bytes. XXX split this part out into
- # a new method called .get_decoded_payload().
- if self.is_multipart():
- return None
- cte = self.get('content-transfer-encoding', '').lower()
if cte == 'quoted-printable':
- return utils._qdecode(payload)
+ return utils._qdecode(bpayload)
elif cte == 'base64':
try:
- if isinstance(payload, str):
- payload = payload.encode('raw-unicode-escape')
- return base64.b64decode(payload)
- #return utils._bdecode(payload)
+ return base64.b64decode(bpayload)
except binascii.Error:
# Incorrect padding
- pass
+ return bpayload
elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
- in_file = BytesIO(payload.encode('raw-unicode-escape'))
+ in_file = BytesIO(bpayload)
out_file = BytesIO()
try:
uu.decode(in_file, out_file, quiet=True)
return out_file.getvalue()
except uu.Error:
# Some decoding problem
- pass
- # Is there a better way to do this? We can't use the bytes
- # constructor.
+ return bpayload
if isinstance(payload, str):
- return payload.encode('raw-unicode-escape')
+ return bpayload
return payload
def set_payload(self, payload, charset=None):
@@ -348,7 +401,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return [v for k, v in self._headers]
+ return [_sanitize_header(k, v) for k, v in self._headers]
def items(self):
"""Get all the message's header fields and values.
@@ -358,7 +411,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return self._headers[:]
+ return [(k, _sanitize_header(k, v)) for k, v in self._headers]
def get(self, name, failobj=None):
"""Get a header value.
@@ -369,7 +422,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- return v
+ return _sanitize_header(k, v)
return failobj
#
@@ -389,7 +442,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- values.append(v)
+ values.append(_sanitize_header(k, v))
if not values:
return failobj
return values
@@ -511,7 +564,7 @@ class Message:
if value is missing:
return failobj
params = []
- for p in _parseparam(';' + value):
+ for p in _parseparam(value):
try:
name, val = p.split('=', 1)
name = name.strip()
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index 06014e2..6caaff5 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -7,7 +7,7 @@
__all__ = ['Parser', 'HeaderParser']
import warnings
-from io import StringIO
+from io import StringIO, TextIOWrapper
from email.feedparser import FeedParser
from email.message import Message
@@ -89,3 +89,48 @@ class HeaderParser(Parser):
def parsestr(self, text, headersonly=True):
return Parser.parsestr(self, text, True)
+
+
+class BytesParser:
+
+    def __init__(self, *args, **kw):
+        """Parser of binary RFC 2822 and MIME email messages.
+
+        Creates an in-memory object tree representing the email message, which
+        can then be manipulated and turned over to a Generator to return the
+        textual representation of the message.
+
+        The input must be formatted as a block of RFC 2822 headers and header
+        continuation lines, optionally preceded by a `Unix-from' header.  The
+        header block is terminated either by the end of the input or by a
+        blank line.
+
+        _class is the class to instantiate for new message objects when they
+        must be created.  This class must have a constructor that can take
+        zero arguments.  Default is Message.Message.
+
+        All positional and keyword arguments are handed unchanged to the
+        text Parser that this class wraps.
+        """
+        self.parser = Parser(*args, **kw)
+
+    def parse(self, fp, headersonly=False):
+        """Create a message structure from the data in a binary file.
+
+        Reads all the data from the file and returns the root of the message
+        structure.  Optional headersonly is a flag specifying whether to stop
+        parsing after reading the headers or not.  The default is False,
+        meaning it parses the entire contents of the file.
+
+        The file object passed in is left open; closing it remains the
+        caller's responsibility.
+        """
+        # Bytes are mapped to str with surrogateescape so that non-ASCII
+        # octets survive the round trip through the text parser.
+        fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape')
+        try:
+            return self.parser.parse(fp, headersonly)
+        finally:
+            # Detach rather than close: closing (or garbage collecting) the
+            # wrapper would also close the caller's underlying binary file.
+            fp.detach()
+
+    def parsebytes(self, text, headersonly=False):
+        """Create a message structure from a byte string.
+
+        Returns the root of the message structure.  Optional headersonly is a
+        flag specifying whether to stop parsing after reading the headers or
+        not.  The default is False, meaning it parses the entire contents of
+        the byte string.
+        """
+        text = text.decode('ASCII', errors='surrogateescape')
+        return self.parser.parsestr(text, headersonly)
diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py
index f88115e..78638d5 100644
--- a/Lib/email/quoprimime.py
+++ b/Lib/email/quoprimime.py
@@ -11,7 +11,7 @@ character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.
Quoted-printable is very space-inefficient for encoding binary files; use the
-email.base64MIME module for that instead.
+email.base64mime module for that instead.
This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.
@@ -23,7 +23,7 @@ in To:/From:/Cc: etc. fields, as well as Subject: lines.
This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding. To deal with the various line
-wrapping issues, use the email.Header module.
+wrapping issues, use the email.header module.
"""
__all__ = [
@@ -316,7 +316,7 @@ def header_decode(s):
This function does not parse a full MIME header value encoded with
quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
- the high level email.Header class for that functionality.
+ the high level email.header class for that functionality.
"""
s = s.replace('_', ' ')
return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, re.ASCII)
diff --git a/Lib/email/test/data/msg_10.txt b/Lib/email/test/data/msg_10.txt
index bd30d13..0790396 100644
--- a/Lib/email/test/data/msg_10.txt
+++ b/Lib/email/test/data/msg_10.txt
@@ -26,6 +26,13 @@ VGhpcyBpcyBhIEJhc2U2NCBlbmNvZGVkIG1lc3NhZ2Uu
--BOUNDARY
Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: Base64
+
+VGhpcyBpcyBhIEJhc2U2NCBlbmNvZGVkIG1lc3NhZ2UuCg==
+
+
+--BOUNDARY
+Content-Type: text/plain; charset="iso-8859-1"
This has no Content-Transfer-Encoding: header.
diff --git a/Lib/email/test/data/msg_26.txt b/Lib/email/test/data/msg_26.txt
index 6c71bce..58efaa9 100644
--- a/Lib/email/test/data/msg_26.txt
+++ b/Lib/email/test/data/msg_26.txt
@@ -24,7 +24,8 @@ Simple email with attachment.
--1618492860--2051301190--113853680
-Content-Type: application/riscos; name="clock.bmp,69c"; type=BMP; load=&fff69c4b; exec=&355dd4d1; access=&03
+Content-Type: application/riscos; name="clock.bmp,69c"; type=BMP;
+ load=&fff69c4b; exec=&355dd4d1; access=&03
Content-Disposition: attachment; filename="clock.bmp"
Content-Transfer-Encoding: base64
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 5b8b7bf..48c3c59 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -3,6 +3,7 @@
# email package unit tests
import os
+import re
import sys
import time
import base64
@@ -11,7 +12,7 @@ import unittest
import warnings
import textwrap
-from io import StringIO
+from io import StringIO, BytesIO
from itertools import chain
import email
@@ -35,7 +36,7 @@ from email import iterators
from email import base64mime
from email import quoprimime
-from test.support import findfile, run_unittest
+from test.support import findfile, run_unittest, unlink
from email.test import __file__ as landmark
@@ -193,8 +194,8 @@ class TestMessageAPI(TestEmailBase):
def test_message_rfc822_only(self):
# Issue 7970: message/rfc822 not in multipart parsed by
# HeaderParser caused an exception when flattened.
- fp = openfile(findfile('msg_46.txt'))
- msgdata = fp.read()
+ with openfile(findfile('msg_46.txt')) as fp:
+ msgdata = fp.read()
parser = HeaderParser()
msg = parser.parsestr(msgdata)
out = StringIO()
@@ -216,8 +217,12 @@ class TestMessageAPI(TestEmailBase):
# Subpart 3 is base64
eq(msg.get_payload(2).get_payload(decode=True),
b'This is a Base64 encoded message.')
- # Subpart 4 has no Content-Transfer-Encoding: header.
+ # Subpart 4 is base64 with a trailing newline, which
+ # used to be stripped (issue 7143).
eq(msg.get_payload(3).get_payload(decode=True),
+ b'This is a Base64 encoded message.\n')
+ # Subpart 5 has no Content-Transfer-Encoding: header.
+ eq(msg.get_payload(4).get_payload(decode=True),
b'This has no Content-Transfer-Encoding: header.\n')
def test_get_decoded_uu_payload(self):
@@ -529,7 +534,7 @@ class TestMessageAPI(TestEmailBase):
msg.add_header('Content-Disposition', 'attachment',
filename="Fußballer.ppt")
self.assertEqual(
- 'attachment; filename*="utf-8\'\'Fu%C3%9Fballer.ppt"',
+ 'attachment; filename*=utf-8\'\'Fu%C3%9Fballer.ppt',
msg['Content-Disposition'])
def test_nonascii_add_header_via_triple(self):
@@ -537,9 +542,24 @@ class TestMessageAPI(TestEmailBase):
msg.add_header('Content-Disposition', 'attachment',
filename=('iso-8859-1', '', 'Fußballer.ppt'))
self.assertEqual(
- 'attachment; filename*="iso-8859-1\'\'Fu%DFballer.ppt"',
+ 'attachment; filename*=iso-8859-1\'\'Fu%DFballer.ppt',
msg['Content-Disposition'])
+ def test_ascii_add_header_with_tspecial(self):
+ msg = Message()
+ msg.add_header('Content-Disposition', 'attachment',
+ filename="windows [filename].ppt")
+ self.assertEqual(
+ 'attachment; filename="windows [filename].ppt"',
+ msg['Content-Disposition'])
+
+ def test_nonascii_add_header_with_tspecial(self):
+ msg = Message()
+ msg.add_header('Content-Disposition', 'attachment',
+ filename="Fußballer [filename].ppt")
+ self.assertEqual(
+ "attachment; filename*=utf-8''Fu%C3%9Fballer%20%5Bfilename%5D.ppt",
+ msg['Content-Disposition'])
# Issue 5871: reject an attempt to embed a header inside a header value
# (header injection attack).
@@ -714,6 +734,20 @@ wasnipoop; giraffes="very-long-necked-animals";
wasnipoop; giraffes="very-long-necked-animals";
\tspooge="yummy"; hippos="gargantuan"; marshmallows="gooey"''')
+ def test_header_encode_with_different_output_charset(self):
+ h = Header('文', 'euc-jp')
+ self.assertEqual(h.encode(), "=?iso-2022-jp?b?GyRCSjgbKEI=?=")
+
+ def test_long_header_encode_with_different_output_charset(self):
+ h = Header(b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5\xa4\xec\xa4'
+ b'\xbf\xa5\xe1\xa1\xbc\xa5\xeb\xa4\xcf\xbb\xca\xb2\xf1\xbc\xd4'
+ b'\xa4\xce\xbe\xb5\xc7\xa7\xa4\xf2\xc2\xd4\xa4\xc3\xa4\xc6\xa4'
+ b'\xa4\xa4\xde\xa4\xb9'.decode('euc-jp'), 'euc-jp')
+ res = """\
+=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYSE8JWskTztKMnE8VCROPjUbKEI=?=
+ =?iso-2022-jp?b?GyRCRyckckJUJEMkRiQkJF4kORsoQg==?="""
+ self.assertEqual(h.encode(), res)
+
def test_header_splitter(self):
eq = self.ndiffAssertEqual
msg = MIMEText('')
@@ -2083,17 +2117,20 @@ message 2
# should be identical. Note: that we ignore the Unix-From since that may
# contain a changed date.
class TestIdempotent(TestEmailBase):
+
+ linesep = '\n'
+
def _msgobj(self, filename):
with openfile(filename) as fp:
data = fp.read()
msg = email.message_from_string(data)
return msg, data
- def _idempotent(self, msg, text):
+ def _idempotent(self, msg, text, unixfrom=False):
eq = self.ndiffAssertEqual
s = StringIO()
g = Generator(s, maxheaderlen=0)
- g.flatten(msg)
+ g.flatten(msg, unixfrom=unixfrom)
eq(text, s.getvalue())
def test_parse_text_message(self):
@@ -2180,6 +2217,14 @@ class TestIdempotent(TestEmailBase):
msg, text = self._msgobj('msg_36.txt')
self._idempotent(msg, text)
+ def test_message_delivery_status(self):
+ msg, text = self._msgobj('msg_43.txt')
+ self._idempotent(msg, text, unixfrom=True)
+
+ def test_message_signed_idempotent(self):
+ msg, text = self._msgobj('msg_45.txt')
+ self._idempotent(msg, text)
+
def test_content_type(self):
eq = self.assertEqual
unless = self.assertTrue
@@ -2192,16 +2237,16 @@ class TestIdempotent(TestEmailBase):
params[pk] = pv
eq(params['report-type'], 'delivery-status')
eq(params['boundary'], 'D1690A7AC1.996856090/mail.example.com')
- eq(msg.preamble, 'This is a MIME-encapsulated message.\n')
- eq(msg.epilogue, '\n')
+ eq(msg.preamble, 'This is a MIME-encapsulated message.' + self.linesep)
+ eq(msg.epilogue, self.linesep)
eq(len(msg.get_payload()), 3)
# Make sure the subparts are what we expect
msg1 = msg.get_payload(0)
eq(msg1.get_content_type(), 'text/plain')
- eq(msg1.get_payload(), 'Yadda yadda yadda\n')
+ eq(msg1.get_payload(), 'Yadda yadda yadda' + self.linesep)
msg2 = msg.get_payload(1)
eq(msg2.get_content_type(), 'text/plain')
- eq(msg2.get_payload(), 'Yadda yadda yadda\n')
+ eq(msg2.get_payload(), 'Yadda yadda yadda' + self.linesep)
msg3 = msg.get_payload(2)
eq(msg3.get_content_type(), 'message/rfc822')
self.assertTrue(isinstance(msg3, Message))
@@ -2210,7 +2255,7 @@ class TestIdempotent(TestEmailBase):
eq(len(payload), 1)
msg4 = payload[0]
unless(isinstance(msg4, Message))
- eq(msg4.get_payload(), 'Yadda yadda yadda\n')
+ eq(msg4.get_payload(), 'Yadda yadda yadda' + self.linesep)
def test_parser(self):
eq = self.assertEqual
@@ -2227,7 +2272,7 @@ class TestIdempotent(TestEmailBase):
self.assertTrue(isinstance(msg1, Message))
eq(msg1.get_content_type(), 'text/plain')
self.assertTrue(isinstance(msg1.get_payload(), str))
- eq(msg1.get_payload(), '\n')
+ eq(msg1.get_payload(), self.linesep)
@@ -2296,7 +2341,8 @@ class TestMiscellaneous(TestEmailBase):
all.sort()
self.assertEqual(all, [
'base64mime', 'charset', 'encoders', 'errors', 'generator',
- 'header', 'iterators', 'message', 'message_from_file',
+ 'header', 'iterators', 'message', 'message_from_binary_file',
+ 'message_from_bytes', 'message_from_file',
'message_from_string', 'mime', 'parser',
'quoprimime', 'utils',
])
@@ -2339,6 +2385,16 @@ class TestMiscellaneous(TestEmailBase):
eq(utils.parsedate_tz('5 Feb 2003 13:47:26 -0800'),
(2003, 2, 5, 13, 47, 26, 0, 1, -1, -28800))
+ def test_parsedate_no_space_before_positive_offset(self):
+ self.assertEqual(utils.parsedate_tz('Wed, 3 Apr 2002 14:58:26+0800'),
+ (2002, 4, 3, 14, 58, 26, 0, 1, -1, 28800))
+
+ def test_parsedate_no_space_before_negative_offset(self):
+ # Issue 1155362: we already handled '+' for this case.
+ self.assertEqual(utils.parsedate_tz('Wed, 3 Apr 2002 14:58:26-0800'),
+ (2002, 4, 3, 14, 58, 26, 0, 1, -1, -28800))
+
+
def test_parsedate_acceptable_to_time_functions(self):
eq = self.assertEqual
timetup = utils.parsedate('5 Feb 2003 13:47:26 -0800')
@@ -2415,6 +2471,24 @@ class TestMiscellaneous(TestEmailBase):
eq(utils.parseaddr('"\\\\"example\\\\" example"@example.com'),
('', '"\\\\"example\\\\" example"@example.com'))
+ def test_parseaddr_preserves_spaces_in_local_part(self):
+ # issue 9286. A normal RFC5322 local part should not contain any
+ # folding white space, but legacy local parts can (they are a sequence
+ # of atoms, not dotatoms). On the other hand we strip whitespace from
+ # before the @ and around dots, on the assumption that the whitespace
+ # around the punctuation is a mistake in what would otherwise be
+ # an RFC5322 local part. Leading whitespace is, as usual, stripped as well.
+ self.assertEqual(('', "merwok wok@xample.com"),
+ utils.parseaddr("merwok wok@xample.com"))
+ self.assertEqual(('', "merwok wok@xample.com"),
+ utils.parseaddr("merwok wok@xample.com"))
+ self.assertEqual(('', "merwok wok@xample.com"),
+ utils.parseaddr(" merwok wok @xample.com"))
+ self.assertEqual(('', 'merwok"wok" wok@xample.com'),
+ utils.parseaddr('merwok"wok" wok@xample.com'))
+ self.assertEqual(('', 'merwok.wok.wok@xample.com'),
+ utils.parseaddr('merwok. wok . wok@xample.com'))
+
def test_multiline_from_comment(self):
x = """\
Foo
@@ -2553,6 +2627,10 @@ multipart/report
text/rfc822-headers
""")
+ def test_make_msgid_domain(self):
+ self.assertEqual(
+ email.utils.make_msgid(domain='testdomain-string')[-19:],
+ '@testdomain-string>')
# Test the iterator/generators
@@ -2704,6 +2782,18 @@ Here's the message body
part2 = msg.get_payload(1)
eq(part2.get_content_type(), 'application/riscos')
+ def test_crlf_flatten(self):
+ # Using newline='\n' preserves the crlfs in this input file.
+ with openfile('msg_26.txt', newline='\n') as fp:
+ text = fp.read()
+ msg = email.message_from_string(text)
+ s = StringIO()
+ g = Generator(s)
+ g.flatten(msg, linesep='\r\n')
+ self.assertEqual(s.getvalue(), text)
+
+ maxDiff = None
+
def test_multipart_digest_with_extra_mime_headers(self):
eq = self.assertEqual
neq = self.ndiffAssertEqual
@@ -2797,6 +2887,417 @@ Here's the message body
self.assertTrue(msg.get_payload(0).get_payload().endswith('\r\n'))
+class Test8BitBytesHandling(unittest.TestCase):
+ # In Python3 all input is string, but that doesn't work if the actual input
+ # uses an 8bit transfer encoding. To hack around that, in email 5.1 we
+ # decode byte streams using the surrogateescape error handler, and
+ # reconvert to binary at appropriate places if we detect surrogates. This
+ # doesn't allow us to transform headers with 8bit bytes (they get munged),
+ # but it does allow us to parse and preserve them, and to decode body
+ # parts that use an 8bit CTE.
+
+ bodytest_msg = textwrap.dedent("""\
+ From: foo@bar.com
+ To: baz
+ Mime-Version: 1.0
+ Content-Type: text/plain; charset={charset}
+ Content-Transfer-Encoding: {cte}
+
+ {bodyline}
+ """)
+
+ def test_known_8bit_CTE(self):
+ m = self.bodytest_msg.format(charset='utf-8',
+ cte='8bit',
+ bodyline='pöstal').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(), "pöstal\n")
+ self.assertEqual(msg.get_payload(decode=True),
+ "pöstal\n".encode('utf-8'))
+
+ def test_unknown_8bit_CTE(self):
+ m = self.bodytest_msg.format(charset='notavalidcharset',
+ cte='8bit',
+ bodyline='pöstal').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(), "p\uFFFD\uFFFDstal\n")
+ self.assertEqual(msg.get_payload(decode=True),
+ "pöstal\n".encode('utf-8'))
+
+ def test_8bit_in_quopri_body(self):
+ # This is non-RFC compliant data...without 'decode' the library code
+ # decodes the body using the charset from the headers, and because the
+ # source byte really is utf-8 this works. This is likely to fail
+ # against real dirty data (ie: produce mojibake), but the data is
+ # invalid anyway so it is as good a guess as any. But this means that
+ # this test just confirms the current behavior; that behavior is not
+ # necessarily the best possible behavior. With 'decode' it is
+ # returning the raw bytes, so that test should be of correct behavior,
+ # or at least produce the same result that email4 did.
+ m = self.bodytest_msg.format(charset='utf-8',
+ cte='quoted-printable',
+ bodyline='p=C3=B6stál').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(), 'p=C3=B6stál\n')
+ self.assertEqual(msg.get_payload(decode=True),
+ 'pöstál\n'.encode('utf-8'))
+
+ def test_invalid_8bit_in_non_8bit_cte_uses_replace(self):
+ # This is similar to the previous test, but proves that if the 8bit
+ # byte is undecodeable in the specified charset, it gets replaced
+ # by the unicode 'unknown' character. Again, this may or may not
+ # be the ideal behavior. Note that if decode=False none of the
+ # decoders will get involved, so this is the only test we need
+ # for this behavior.
+ m = self.bodytest_msg.format(charset='ascii',
+ cte='quoted-printable',
+ bodyline='p=C3=B6stál').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(), 'p=C3=B6st\uFFFD\uFFFDl\n')
+ self.assertEqual(msg.get_payload(decode=True),
+ 'pöstál\n'.encode('utf-8'))
+
+ def test_8bit_in_base64_body(self):
+ # Sticking an 8bit byte in a base64 block makes it undecodable by
+ # normal means, so the block is returned undecoded, but as bytes.
+ m = self.bodytest_msg.format(charset='utf-8',
+ cte='base64',
+ bodyline='cMO2c3RhbAá=').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(decode=True),
+ 'cMO2c3RhbAá=\n'.encode('utf-8'))
+
+ def test_8bit_in_uuencode_body(self):
+ # Sticking an 8bit byte in a uuencode block makes it undecodable by
+ # normal means, so the block is returned undecoded, but as bytes.
+ m = self.bodytest_msg.format(charset='utf-8',
+ cte='uuencode',
+ bodyline='<,.V<W1A; á ').encode('utf-8')
+ msg = email.message_from_bytes(m)
+ self.assertEqual(msg.get_payload(decode=True),
+ '<,.V<W1A; á \n'.encode('utf-8'))
+
+
+ headertest_headers = (
+ ('From: foo@bar.com', ('From', 'foo@bar.com')),
+ ('To: báz', ('To', '=?unknown-8bit?q?b=C3=A1z?=')),
+ ('Subject: Maintenant je vous présente mon collègue, le pouf célèbre\n'
+ '\tJean de Baddie',
+ ('Subject', '=?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_'
+ 'coll=C3=A8gue=2C_le_pouf_c=C3=A9l=C3=A8bre?=\n'
+ ' =?unknown-8bit?q?_Jean_de_Baddie?=')),
+ ('From: göst', ('From', '=?unknown-8bit?b?Z8O2c3Q=?=')),
+ )
+ headertest_msg = ('\n'.join([src for (src, _) in headertest_headers]) +
+ '\nYes, they are flying.\n').encode('utf-8')
+
+ def test_get_8bit_header(self):
+ msg = email.message_from_bytes(self.headertest_msg)
+ self.assertEqual(str(msg.get('to')), 'b\uFFFD\uFFFDz')
+ self.assertEqual(str(msg['to']), 'b\uFFFD\uFFFDz')
+
+ def test_print_8bit_headers(self):
+ msg = email.message_from_bytes(self.headertest_msg)
+ self.assertEqual(str(msg),
+ textwrap.dedent("""\
+ From: {}
+ To: {}
+ Subject: {}
+ From: {}
+
+ Yes, they are flying.
+ """).format(*[expected[1] for (_, expected) in
+ self.headertest_headers]))
+
+ def test_values_with_8bit_headers(self):
+ msg = email.message_from_bytes(self.headertest_msg)
+ self.assertListEqual([str(x) for x in msg.values()],
+ ['foo@bar.com',
+ 'b\uFFFD\uFFFDz',
+ 'Maintenant je vous pr\uFFFD\uFFFDsente mon '
+ 'coll\uFFFD\uFFFDgue, le pouf '
+ 'c\uFFFD\uFFFDl\uFFFD\uFFFDbre\n'
+ '\tJean de Baddie',
+ "g\uFFFD\uFFFDst"])
+
+ def test_items_with_8bit_headers(self):
+ msg = email.message_from_bytes(self.headertest_msg)
+ self.assertListEqual([(str(x), str(y)) for (x, y) in msg.items()],
+ [('From', 'foo@bar.com'),
+ ('To', 'b\uFFFD\uFFFDz'),
+ ('Subject', 'Maintenant je vous '
+ 'pr\uFFFD\uFFFDsente '
+ 'mon coll\uFFFD\uFFFDgue, le pouf '
+ 'c\uFFFD\uFFFDl\uFFFD\uFFFDbre\n'
+ '\tJean de Baddie'),
+ ('From', 'g\uFFFD\uFFFDst')])
+
+ def test_get_all_with_8bit_headers(self):
+ msg = email.message_from_bytes(self.headertest_msg)
+ self.assertListEqual([str(x) for x in msg.get_all('from')],
+ ['foo@bar.com',
+ 'g\uFFFD\uFFFDst'])
+
+ def test_get_content_type_with_8bit(self):
+ msg = email.message_from_bytes(textwrap.dedent("""\
+ Content-Type: text/pl\xA7in; charset=utf-8
+ """).encode('latin-1'))
+ self.assertEqual(msg.get_content_type(), "text/pl\uFFFDin")
+ self.assertEqual(msg.get_content_maintype(), "text")
+ self.assertEqual(msg.get_content_subtype(), "pl\uFFFDin")
+
+ def test_get_params_with_8bit(self):
+ msg = email.message_from_bytes(
+ 'X-Header: foo=\xa7ne; b\xa7r=two; baz=three\n'.encode('latin-1'))
+ self.assertEqual(msg.get_params(header='x-header'),
+ [('foo', '\uFFFDne'), ('b\uFFFDr', 'two'), ('baz', 'three')])
+ self.assertEqual(msg.get_param('Foo', header='x-header'), '\uFFFdne')
+ # XXX: someday you might be able to get 'b\xa7r', for now you can't.
+ self.assertEqual(msg.get_param('b\xa7r', header='x-header'), None)
+
+ def test_get_rfc2231_params_with_8bit(self):
+ msg = email.message_from_bytes(textwrap.dedent("""\
+ Content-Type: text/plain; charset=us-ascii;
+ title*=us-ascii'en'This%20is%20not%20f\xa7n"""
+ ).encode('latin-1'))
+ self.assertEqual(msg.get_param('title'),
+ ('us-ascii', 'en', 'This is not f\uFFFDn'))
+
+ def test_set_rfc2231_params_with_8bit(self):
+ msg = email.message_from_bytes(textwrap.dedent("""\
+ Content-Type: text/plain; charset=us-ascii;
+ title*=us-ascii'en'This%20is%20not%20f\xa7n"""
+ ).encode('latin-1'))
+ msg.set_param('title', 'test')
+ self.assertEqual(msg.get_param('title'), 'test')
+
+ def test_del_rfc2231_params_with_8bit(self):
+ msg = email.message_from_bytes(textwrap.dedent("""\
+ Content-Type: text/plain; charset=us-ascii;
+ title*=us-ascii'en'This%20is%20not%20f\xa7n"""
+ ).encode('latin-1'))
+ msg.del_param('title')
+ self.assertEqual(msg.get_param('title'), None)
+ self.assertEqual(msg.get_content_maintype(), 'text')
+
+ def test_get_payload_with_8bit_cte_header(self):
+ msg = email.message_from_bytes(textwrap.dedent("""\
+ Content-Transfer-Encoding: b\xa7se64
+ Content-Type: text/plain; charset=latin-1
+
+ payload
+ """).encode('latin-1'))
+ self.assertEqual(msg.get_payload(), 'payload\n')
+ self.assertEqual(msg.get_payload(decode=True), b'payload\n')
+
+ non_latin_bin_msg = textwrap.dedent("""\
+ From: foo@bar.com
+ To: báz
+ Subject: Maintenant je vous présente mon collègue, le pouf célèbre
+ \tJean de Baddie
+ Mime-Version: 1.0
+ Content-Type: text/plain; charset="utf-8"
+ Content-Transfer-Encoding: 8bit
+
+ Да, они летят.
+ """).encode('utf-8')
+
+ def test_bytes_generator(self):
+ msg = email.message_from_bytes(self.non_latin_bin_msg)
+ out = BytesIO()
+ email.generator.BytesGenerator(out).flatten(msg)
+ self.assertEqual(out.getvalue(), self.non_latin_bin_msg)
+
+ def test_bytes_generator_handles_None_body(self):
+ #Issue 11019
+ msg = email.message.Message()
+ out = BytesIO()
+ email.generator.BytesGenerator(out).flatten(msg)
+ self.assertEqual(out.getvalue(), b"\n")
+
+ non_latin_bin_msg_as7bit_wrapped = textwrap.dedent("""\
+ From: foo@bar.com
+ To: =?unknown-8bit?q?b=C3=A1z?=
+ Subject: =?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_coll=C3=A8gue?=
+ =?unknown-8bit?q?=2C_le_pouf_c=C3=A9l=C3=A8bre?=
+ =?unknown-8bit?q?_Jean_de_Baddie?=
+ Mime-Version: 1.0
+ Content-Type: text/plain; charset="utf-8"
+ Content-Transfer-Encoding: base64
+
+ 0JTQsCwg0L7QvdC4INC70LXRgtGP0YIuCg==
+ """)
+
+ def test_generator_handles_8bit(self):
+ msg = email.message_from_bytes(self.non_latin_bin_msg)
+ out = StringIO()
+ email.generator.Generator(out).flatten(msg)
+ self.assertEqual(out.getvalue(), self.non_latin_bin_msg_as7bit_wrapped)
+
+ def test_bytes_generator_with_unix_from(self):
+ # The unixfrom contains a current date, so we can't check it
+ # literally. Just make sure the first word is 'From' and the
+ # rest of the message matches the input.
+ msg = email.message_from_bytes(self.non_latin_bin_msg)
+ out = BytesIO()
+ email.generator.BytesGenerator(out).flatten(msg, unixfrom=True)
+ lines = out.getvalue().split(b'\n')
+ self.assertEqual(lines[0].split()[0], b'From')
+ self.assertEqual(b'\n'.join(lines[1:]), self.non_latin_bin_msg)
+
+ non_latin_bin_msg_as7bit = non_latin_bin_msg_as7bit_wrapped.split('\n')
+ non_latin_bin_msg_as7bit[2:4] = [
+ 'Subject: =?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_'
+ 'coll=C3=A8gue=2C_le_pouf_c=C3=A9l=C3=A8bre?=']
+ non_latin_bin_msg_as7bit = '\n'.join(non_latin_bin_msg_as7bit)
+
+ def test_message_from_binary_file(self):
+ fn = 'test.msg'
+ self.addCleanup(unlink, fn)
+ with open(fn, 'wb') as testfile:
+ testfile.write(self.non_latin_bin_msg)
+ with open(fn, 'rb') as testfile:
+ m = email.parser.BytesParser().parse(testfile)
+ self.assertEqual(str(m), self.non_latin_bin_msg_as7bit)
+
+ latin_bin_msg = textwrap.dedent("""\
+ From: foo@bar.com
+ To: Dinsdale
+ Subject: Nudge nudge, wink, wink
+ Mime-Version: 1.0
+ Content-Type: text/plain; charset="latin-1"
+ Content-Transfer-Encoding: 8bit
+
+ oh là là, know what I mean, know what I mean?
+ """).encode('latin-1')
+
+ latin_bin_msg_as7bit = textwrap.dedent("""\
+ From: foo@bar.com
+ To: Dinsdale
+ Subject: Nudge nudge, wink, wink
+ Mime-Version: 1.0
+ Content-Type: text/plain; charset="iso-8859-1"
+ Content-Transfer-Encoding: quoted-printable
+
+ oh l=E0 l=E0, know what I mean, know what I mean?
+ """)
+
+ def test_string_generator_reencodes_to_quopri_when_appropriate(self):
+ m = email.message_from_bytes(self.latin_bin_msg)
+ self.assertEqual(str(m), self.latin_bin_msg_as7bit)
+
+ def test_decoded_generator_emits_unicode_body(self):
+ m = email.message_from_bytes(self.latin_bin_msg)
+ out = StringIO()
+ email.generator.DecodedGenerator(out).flatten(m)
+ #DecodedGenerator output contains an extra blank line compared
+ #to the input message. RDM: not sure if this is a bug or not,
+ #but it is not specific to the 8bit->7bit conversion.
+ self.assertEqual(out.getvalue(),
+ self.latin_bin_msg.decode('latin-1')+'\n')
+
+ def test_bytes_feedparser(self):
+ bfp = email.feedparser.BytesFeedParser()
+ for i in range(0, len(self.latin_bin_msg), 10):
+ bfp.feed(self.latin_bin_msg[i:i+10])
+ m = bfp.close()
+ self.assertEqual(str(m), self.latin_bin_msg_as7bit)
+
+ def test_crlf_flatten(self):
+ with openfile('msg_26.txt', 'rb') as fp:
+ text = fp.read()
+ msg = email.message_from_bytes(text)
+ s = BytesIO()
+ g = email.generator.BytesGenerator(s)
+ g.flatten(msg, linesep='\r\n')
+ self.assertEqual(s.getvalue(), text)
+
+ def test_8bit_multipart(self):
+ # Issue 11605
+ source = textwrap.dedent("""\
+ Date: Fri, 18 Mar 2011 17:15:43 +0100
+ To: foo@example.com
+ From: foodwatch-Newsletter <bar@example.com>
+ Subject: Aktuelles zu Japan, Klonfleisch und Smiley-System
+ Message-ID: <76a486bee62b0d200f33dc2ca08220ad@localhost.localdomain>
+ MIME-Version: 1.0
+ Content-Type: multipart/alternative;
+ boundary="b1_76a486bee62b0d200f33dc2ca08220ad"
+
+ --b1_76a486bee62b0d200f33dc2ca08220ad
+ Content-Type: text/plain; charset="utf-8"
+ Content-Transfer-Encoding: 8bit
+
+ Guten Tag, ,
+
+ mit großer Betroffenheit verfolgen auch wir im foodwatch-Team die
+ Nachrichten aus Japan.
+
+
+ --b1_76a486bee62b0d200f33dc2ca08220ad
+ Content-Type: text/html; charset="utf-8"
+ Content-Transfer-Encoding: 8bit
+
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ "http://www.w3.org/TR/html4/loose.dtd">
+ <html lang="de">
+ <head>
+ <title>foodwatch - Newsletter</title>
+ </head>
+ <body>
+ <p>mit gro&szlig;er Betroffenheit verfolgen auch wir im foodwatch-Team
+ die Nachrichten aus Japan.</p>
+ </body>
+ </html>
+ --b1_76a486bee62b0d200f33dc2ca08220ad--
+
+ """).encode('utf-8')
+ msg = email.message_from_bytes(source)
+ s = BytesIO()
+ g = email.generator.BytesGenerator(s)
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), source)
+
+ maxDiff = None
+
+
+class BaseTestBytesGeneratorIdempotent:
+ # Mixin providing bytes-based versions of the _msgobj/_idempotent helpers.
+ # The concrete classes that use it must define linesep (str), blinesep
+ # (bytes) and normalize_linesep_regex (a compiled bytes pattern).
+
+ maxDiff = None
+
+ def _msgobj(self, filename):
+ # Read the test file in binary mode, rewrite every match of
+ # normalize_linesep_regex to blinesep, and parse the result with
+ # email.message_from_bytes.  Returns (message, normalized bytes).
+ with openfile(filename, 'rb') as fp:
+ data = fp.read()
+ data = self.normalize_linesep_regex.sub(self.blinesep, data)
+ msg = email.message_from_bytes(data)
+ return msg, data
+
+ def _idempotent(self, msg, data, unixfrom=False):
+ # Flatten msg through BytesGenerator using self.linesep and check that
+ # the bytes produced equal the (normalized) input data.
+ b = BytesIO()
+ g = email.generator.BytesGenerator(b, maxheaderlen=0)
+ g.flatten(msg, unixfrom=unixfrom, linesep=self.linesep)
+ self.assertByteStringsEqual(data, b.getvalue())
+
+ def assertByteStringsEqual(self, str1, str2):
+ # Compare the two byte strings as lists of lines split on b'\n'.
+ # Not using self.blinesep here is intentional. This way the output
+ # is more useful when the failure results in mixed line endings.
+ self.assertListEqual(str1.split(b'\n'), str2.split(b'\n'))
+
+
+class TestBytesGeneratorIdempotentNL(BaseTestBytesGeneratorIdempotent,
+ TestIdempotent):
+ # LF variant: the regex matches each CRLF pair, which the mixin's
+ # _msgobj replaces with blinesep (a bare LF) before parsing.
+ linesep = '\n'
+ blinesep = b'\n'
+ normalize_linesep_regex = re.compile(br'\r\n')
+
+
+class TestBytesGeneratorIdempotentCRLF(BaseTestBytesGeneratorIdempotent,
+ TestIdempotent):
+ # CRLF variant: the negative lookbehind makes the regex match only an LF
+ # that is not already preceded by CR, so existing CRLF pairs are not
+ # doubled when the mixin's _msgobj substitutes blinesep (CRLF).
+ linesep = '\r\n'
+ blinesep = b'\r\n'
+ normalize_linesep_regex = re.compile(br'(?<!\r)\n')
+
+
class TestBase64(unittest.TestCase):
def test_len(self):
eq = self.assertEqual
@@ -3212,9 +3713,9 @@ class TestCharset(unittest.TestCase):
# built-in encodings where the header encoding is QP but the body
# encoding is not.
from email import charset as CharsetModule
- CharsetModule.add_charset('fake', CharsetModule.QP, None)
+ CharsetModule.add_charset('fake', CharsetModule.QP, None, 'utf-8')
c = Charset('fake')
- eq('hello w\xf6rld', c.body_encode('hello w\xf6rld'))
+ eq('hello world', c.body_encode('hello world'))
def test_unicode_charset_name(self):
charset = Charset('us-ascii')
@@ -3507,6 +4008,20 @@ A very long line that must get split to something other than at the
h.append(x, errors='replace')
eq(str(h), e)
+ def test_escaped_8bit_header(self):
+ x = b'Ynwp4dUEbay Auction Semiar- No Charge \x96 Earn Big'
+ x = x.decode('ascii', 'surrogateescape')
+ h = Header(x, charset=email.charset.UNKNOWN8BIT)
+ self.assertEqual(str(h),
+ 'Ynwp4dUEbay Auction Semiar- No Charge \uFFFD Earn Big')
+ self.assertEqual(email.header.decode_header(h), [(x, 'unknown-8bit')])
+
+ def test_modify_returned_list_does_not_change_header(self):
+ h = Header('test')
+ chunks = email.header.decode_header(h)
+ chunks.append(('ascii', 'test2'))
+ self.assertEqual(str(h), 'test')
+
def test_encoded_adjacent_nonencoded(self):
eq = self.assertEqual
h = Header()
@@ -3581,7 +4096,7 @@ To: bbb@zzz.org
Subject: This is a test message
Date: Fri, 4 May 2001 14:05:44 -0400
Content-Type: text/plain; charset=us-ascii;
- title*="us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20isn%27t%20it%21"
+ title*=us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20isn%27t%20it%21
Hi,
@@ -3611,7 +4126,7 @@ To: bbb@zzz.org
Subject: This is a test message
Date: Fri, 4 May 2001 14:05:44 -0400
Content-Type: text/plain; charset="us-ascii";
- title*="us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20isn%27t%20it%21"
+ title*=us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20isn%27t%20it%21
Hi,
@@ -3626,6 +4141,32 @@ Do you like this message?
msg = self._msgobj('msg_32.txt')
eq(msg.get_content_charset(), 'us-ascii')
+ def test_rfc2231_parse_rfc_quoting(self):
+ m = textwrap.dedent('''\
+ Content-Disposition: inline;
+ \tfilename*0*=''This%20is%20even%20more%20;
+ \tfilename*1*=%2A%2A%2Afun%2A%2A%2A%20;
+ \tfilename*2="is it not.pdf"
+
+ ''')
+ msg = email.message_from_string(m)
+ self.assertEqual(msg.get_filename(),
+ 'This is even more ***fun*** is it not.pdf')
+ self.assertEqual(m, msg.as_string())
+
+ def test_rfc2231_parse_extra_quoting(self):
+ m = textwrap.dedent('''\
+ Content-Disposition: inline;
+ \tfilename*0*="''This%20is%20even%20more%20";
+ \tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
+ \tfilename*2="is it not.pdf"
+
+ ''')
+ msg = email.message_from_string(m)
+ self.assertEqual(msg.get_filename(),
+ 'This is even more ***fun*** is it not.pdf')
+ self.assertEqual(m, msg.as_string())
+
def test_rfc2231_no_language_or_charset(self):
m = '''\
Content-Transfer-Encoding: 8bit
diff --git a/Lib/email/test/test_email_codecs.py b/Lib/email/test/test_email_codecs.py
index acc19c3..ca85f57 100644
--- a/Lib/email/test/test_email_codecs.py
+++ b/Lib/email/test/test_email_codecs.py
@@ -13,7 +13,7 @@ from email.message import Message
# We're compatible with Python 2.3, but it doesn't have the built-in Asian
# codecs, so we have to skip all these tests.
try:
- str('foo', 'euc-jp')
+ str(b'foo', 'euc-jp')
except LookupError:
raise unittest.SkipTest
@@ -22,11 +22,14 @@ except LookupError:
class TestEmailAsianCodecs(TestEmailBase):
def test_japanese_codecs(self):
eq = self.ndiffAssertEqual
- j = Charset("euc-jp")
- g = Charset("iso-8859-1")
+ jcode = "euc-jp"
+ gcode = "iso-8859-1"
+ j = Charset(jcode)
+ g = Charset(gcode)
h = Header("Hello World!")
- jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
- ghello = 'Gr\xfc\xdf Gott!'
+ jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc'
+ b'\xa5\xeb\xa5\xc9\xa1\xaa', jcode)
+ ghello = str(b'Gr\xfc\xdf Gott!', gcode)
h.append(jhello, j)
h.append(ghello, g)
# BAW: This used to -- and maybe should -- fold the two iso-8859-1
@@ -36,13 +39,17 @@ class TestEmailAsianCodecs(TestEmailBase):
# encoded word.
eq(h.encode(), """\
Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
- =?iso-8859-1?q?Gr=FC=DF?= =?iso-8859-1?q?_Gott!?=""")
+ =?iso-8859-1?q?Gr=FC=DF_Gott!?=""")
eq(decode_header(h.encode()),
- [('Hello World!', None),
- ('\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
- ('Gr\xfc\xdf Gott!', 'iso-8859-1')])
- int = 'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5\xa4\xec\xa4\xbf\xa5\xe1\xa1\xbc\xa5\xeb\xa4\xcf\xbb\xca\xb2\xf1\xbc\xd4\xa4\xce\xbe\xb5\xc7\xa7\xa4\xf2\xc2\xd4\xa4\xc3\xa4\xc6\xa4\xa4\xa4\xde\xa4\xb9'
- h = Header(int, j, header_name="Subject")
+ [(b'Hello World!', None),
+ (b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
+ (b'Gr\xfc\xdf Gott!', gcode)])
+ subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5'
+ b'\xa4\xec\xa4\xbf\xa5\xe1\xa1\xbc\xa5\xeb\xa4\xcf\xbb\xca\xb2'
+ b'\xf1\xbc\xd4\xa4\xce\xbe\xb5\xc7\xa7\xa4\xf2\xc2\xd4\xa4\xc3'
+ b'\xa4\xc6\xa4\xa4\xa4\xde\xa4\xb9')
+ subject = str(subject_bytes, jcode)
+ h = Header(subject, j, header_name="Subject")
# test a very long header
enc = h.encode()
# TK: splitting point may differ by codec design and/or Header encoding
@@ -50,15 +57,24 @@ Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYSE8JWskTztKGyhC?=
=?iso-2022-jp?b?GyRCMnE8VCROPjVHJyRyQlQkQyRGJCQkXiQ5GyhC?=""")
# TK: full decode comparison
- eq(h.__unicode__().encode('euc-jp'), int)
+ eq(str(h).encode(jcode), subject_bytes)
+
+ def test_payload_encoding_utf8(self):
+ # Build the text by decoding a bytes literal as euc-jp, then store it
+ # as the payload with 'utf-8' given as the charset.
+ jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc'
+ b'\xa5\xeb\xa5\xc9\xa1\xaa', 'euc-jp')
+ msg = Message()
+ msg.set_payload(jhello, 'utf-8')
+ # Round trip: fetch the payload with decode=True, decode the result
+ # using the charset the message reports, and compare to the original.
+ ustr = msg.get_payload(decode=True).decode(msg.get_content_charset())
+ self.assertEqual(jhello, ustr)
def test_payload_encoding(self):
+ # The text is now built by decoding a bytes literal with jcode, and the
+ # round trip compares text to text: get_payload(decode=True) is decoded
+ # with the charset reported by the message.
- jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
jcode = 'euc-jp'
+ jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc'
+ b'\xa5\xeb\xa5\xc9\xa1\xaa', jcode)
msg = Message()
msg.set_payload(jhello, jcode)
- ustr = str(msg.get_payload(), msg.get_content_charset())
- self.assertEqual(jhello, ustr.encode(jcode))
+ ustr = msg.get_payload(decode=True).decode(msg.get_content_charset())
+ self.assertEqual(jhello, ustr)
diff --git a/Lib/email/test/test_email_torture.py b/Lib/email/test/test_email_torture.py
index 57233bf..544b1bb 100644
--- a/Lib/email/test/test_email_torture.py
+++ b/Lib/email/test/test_email_torture.py
@@ -13,11 +13,11 @@ from io import StringIO
from types import ListType
from email.test.test_email import TestEmailBase
-from test.support import TestSkipped
+from test.support import TestSkipped, run_unittest
import email
from email import __file__ as testfile
-from email.Iterators import _structure
+from email.iterators import _structure
def openfile(filename):
from os.path import join, dirname, abspath
@@ -128,7 +128,7 @@ def suite():
def test_main():
+ # Run each class yielded by _testclasses() through run_unittest, which
+ # this file imports by name from test.support.
for testclass in _testclasses():
- support.run_unittest(testclass)
+ run_unittest(testclass)
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index 5f40bac..ac4da37 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -148,13 +148,15 @@ def formatdate(timeval=None, localtime=False, usegmt=False):
-def make_msgid(idstring=None):
+def make_msgid(idstring=None, domain=None):
"""Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
<20020201195627.33539.96671@nightshade.la.mastaler.com>
Optional idstring if given is a string used to strengthen the
- uniqueness of the message id.
+ uniqueness of the message id. Optional domain if given provides the
+ portion of the message id after the '@'. It defaults to the locally
+ defined hostname.
"""
timeval = time.time()
utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
@@ -164,8 +166,9 @@ def make_msgid(idstring=None):
idstring = ''
else:
idstring = '.' + idstring
- idhost = socket.getfqdn()
- msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
+ if domain is None:
+ domain = socket.getfqdn()
+ msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
return msgid