From caf2f5184b2a132f6ffb0514b13a1295f2df72c5 Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Fri, 21 Mar 2003 21:09:32 +0000 Subject: Backporting email 2.5 to Python 2.2 maint branch. --- Lib/email/Charset.py | 45 +++- Lib/email/Generator.py | 70 +++-- Lib/email/Header.py | 320 ++++++++++++++-------- Lib/email/MIMEText.py | 5 +- Lib/email/Message.py | 44 ++- Lib/email/Parser.py | 43 ++- Lib/email/Utils.py | 28 +- Lib/email/__init__.py | 2 +- Lib/email/_compat21.py | 14 +- Lib/email/_compat22.py | 9 +- Lib/email/base64MIME.py | 3 - Lib/email/quopriMIME.py | 13 +- Lib/email/test/data/msg_21.txt | 2 - Lib/email/test/test_email.py | 518 +++++++++++++++++++++++++++++------- Lib/email/test/test_email_codecs.py | 32 +-- 15 files changed, 819 insertions(+), 329 deletions(-) diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py index b852245..dd328e0 100644 --- a/Lib/email/Charset.py +++ b/Lib/email/Charset.py @@ -35,6 +35,20 @@ CHARSETS = { # input header enc body enc output conv 'iso-8859-1': (QP, QP, None), 'iso-8859-2': (QP, QP, None), + 'iso-8859-3': (QP, QP, None), + 'iso-8859-4': (QP, QP, None), + # iso-8859-5 is Cyrillic, and not especially used + # iso-8859-6 is Arabic, also not particularly used + # iso-8859-7 is Greek, QP will not make it readable + # iso-8859-8 is Hebrew, QP will not make it readable + 'iso-8859-9': (QP, QP, None), + 'iso-8859-10': (QP, QP, None), + # iso-8859-11 is Thai, QP will not make it readable + 'iso-8859-13': (QP, QP, None), + 'iso-8859-14': (QP, QP, None), + 'iso-8859-15': (QP, QP, None), + 'windows-1252':(QP, QP, None), + 'viscii': (QP, QP, None), 'us-ascii': (None, None, None), 'big5': (BASE64, BASE64, None), 'gb2312': (BASE64, BASE64, None), @@ -52,6 +66,25 @@ CHARSETS = { ALIASES = { 'latin_1': 'iso-8859-1', 'latin-1': 'iso-8859-1', + 'latin_2': 'iso-8859-2', + 'latin-2': 'iso-8859-2', + 'latin_3': 'iso-8859-3', + 'latin-3': 'iso-8859-3', + 'latin_4': 'iso-8859-4', + 'latin-4': 'iso-8859-4', + 'latin_5': 'iso-8859-9', + 'latin-5': 'iso-8859-9', + 'latin_6': 'iso-8859-10', + 'latin-6': 'iso-8859-10', + 'latin_7': 'iso-8859-13', + 'latin-7': 'iso-8859-13', + 'latin_8': 'iso-8859-14', + 'latin-8': 'iso-8859-14', + 'latin_9': 'iso-8859-15', + 'latin-9': 'iso-8859-15', + 'cp949': 'ks_c_5601-1987', + 'euc_jp': 'euc-jp', + 'euc_kr': 'euc-kr', 'ascii': 'us-ascii', } @@ -69,6 +102,10 @@ CODEC_MAP = { 'euc-jp': 'japanese.euc-jp', 'iso-2022-jp': 'japanese.iso-2022-jp', 'shift_jis': 'japanese.shift_jis', + 'euc-kr': 'korean.euc-kr', + 'ks_c_5601-1987': 'korean.cp949', + 'iso-2022-kr': 'korean.iso-2022-kr', + 'johab': 'korean.johab', 'gb2132': 'eucgb2312_cn', 'big5': 'big5_tw', 'utf-8': 'utf-8', @@ -197,6 +234,8 @@ class Charset: def __str__(self): return self.input_charset.lower() + __repr__ = __str__ + def __eq__(self, other): return str(self) == str(other).lower() @@ -321,14 +360,14 @@ class Charset: if self.header_encoding == BASE64: return email.base64MIME.header_encode(s, cset) elif self.header_encoding == QP: - return email.quopriMIME.header_encode(s, cset) + return email.quopriMIME.header_encode(s, cset, maxlinelen=None) elif self.header_encoding == SHORTEST: lenb64 = email.base64MIME.base64_len(s) lenqp = email.quopriMIME.header_quopri_len(s) if lenb64 < lenqp: return email.base64MIME.header_encode(s, cset) else: - return email.quopriMIME.header_encode(s, cset) + return email.quopriMIME.header_encode(s, cset, maxlinelen=None) else: return s @@ -348,7 +387,7 @@ class Charset: # 7bit/8bit encodings return the string unchanged (module conversions) if self.body_encoding is BASE64: return email.base64MIME.body_encode(s) - elif self.header_encoding is QP: + elif self.body_encoding is QP: return email.quopriMIME.body_encode(s) else: return s diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py index 58e2f91..9cce51c 100644 --- a/Lib/email/Generator.py +++ b/Lib/email/Generator.py @@ -4,14 +4,16 @@ """Classes to generate plain text from a message object tree. """ -import time import re +import time +import locale import random from types import ListType, StringType from cStringIO import StringIO from email.Header import Header +from email.Parser import NLCRE try: from email._compat22 import _isstring @@ -159,44 +161,29 @@ class Generator: def _write_headers(self, msg): for h, v in msg.items(): - # RFC 2822 says that lines SHOULD be no more than maxheaderlen - # characters wide, so we're well within our rights to split long - # headers. - text = '%s: %s' % (h, v) - if self.__maxheaderlen > 0 and len(text) > self.__maxheaderlen: - text = self._split_header(text) - print >> self._fp, text + print >> self._fp, '%s:' % h, + if self.__maxheaderlen == 0: + # Explicit no-wrapping + print >> self._fp, v + elif isinstance(v, Header): + # Header instances know what to do + print >> self._fp, v.encode() + elif _is8bitstring(v): + # If we have raw 8bit data in a byte string, we have no idea + # what the encoding is. There is no safe way to split this + # string. If it's ascii-subset, then we could do a normal + # ascii split, but if it's multibyte then we could break the + # string. There's no way to know so the least harm seems to + # be to not split the string and risk it being too long. + print >> self._fp, v + else: + # Header's got lots of smarts, so use it. + print >> self._fp, Header( + v, maxlinelen=self.__maxheaderlen, + header_name=h, continuation_ws='\t').encode() # A blank line always separates headers from body print >> self._fp - def _split_header(self, text): - maxheaderlen = self.__maxheaderlen - # Find out whether any lines in the header are really longer than - # maxheaderlen characters wide. There could be continuation lines - # that actually shorten it. Also, replace hard tabs with 8 spaces. - lines = [s.replace('\t', SPACE8) for s in text.splitlines()] - for line in lines: - if len(line) > maxheaderlen: - break - else: - # No line was actually longer than maxheaderlen characters, so - # just return the original unchanged. - return text - # If we have raw 8bit data in a byte string, we have no idea what the - # encoding is. I think there is no safe way to split this string. If - # it's ascii-subset, then we could do a normal ascii split, but if - # it's multibyte then we could break the string. There's no way to - # know so the least harm seems to be to not split the string and risk - # it being too long. - if _is8bitstring(text): - return text - # The `text' argument already has the field name prepended, so don't - # provide it here or the first line will get folded too short. - h = Header(text, maxlinelen=maxheaderlen, - # For backwards compatibility, we use a hard tab here - continuation_ws='\t') - return h.encode() - # # Handlers for writing types and subtypes # @@ -258,6 +245,14 @@ class Generator: # Write out any preamble if msg.preamble is not None: self._fp.write(msg.preamble) + # If preamble is the empty string, the length of the split will be + # 1, but the last element will be the empty string. If it's + # anything else but does not end in a line separator, the length + # will be > 1 and not end in an empty string. We need to + # guarantee a newline after the preamble, but don't add too many. + plines = NLCRE.split(msg.preamble) + if plines <> [''] and plines[-1] <> '': + self._fp.write('\n') # First boundary is a bit different; it doesn't have a leading extra # newline. print >> self._fp, '--' + boundary @@ -364,7 +359,8 @@ class DecodedGenerator(Generator): def _make_boundary(text=None): # Craft a random boundary. If text is given, ensure that the chosen # boundary doesn't appear in the text. - boundary = ('=' * 15) + repr(random.random()).split('.')[1] + '==' + dp = locale.localeconv().get('decimal_point', '.') + boundary = ('=' * 15) + repr(random.random()).split(dp)[1] + '==' if text is None: return boundary b = boundary diff --git a/Lib/email/Header.py b/Lib/email/Header.py index 0ceacc7..624e7c4 100644 --- a/Lib/email/Header.py +++ b/Lib/email/Header.py @@ -4,10 +4,12 @@ """Header encoding and decoding functionality.""" import re +import binascii from types import StringType, UnicodeType import email.quopriMIME import email.base64MIME +from email.Errors import HeaderParseError from email.Charset import Charset try: @@ -25,8 +27,11 @@ except NameError: CRLFSPACE = '\r\n ' CRLF = '\r\n' NL = '\n' +SPACE = ' ' +USPACE = u' ' SPACE8 = ' ' * 8 EMPTYSTRING = '' +UEMPTYSTRING = u'' MAXLINELEN = 76 @@ -47,6 +52,13 @@ ecre = re.compile(r''' \?= # literal ?= ''', re.VERBOSE | re.IGNORECASE) +pcre = re.compile('([,;])') + +# Field name regexp, including trailing colon, but not separating whitespace, +# according to RFC 2822. Character range is from tilde to exclamation mark. +# For use with .match() +fcre = re.compile(r'[\041-\176]+:$') + # Helpers @@ -61,6 +73,9 @@ def decode_header(header): decoded parts of the header. Charset is None for non-encoded parts of the header, otherwise a lower-case string containing the name of the character set specified in the encoded string. + + An email.Errors.HeaderParseError may be raised when certain decoding error + occurs (e.g. a base64 decoding exception). """ # If no encoding, just return the header header = str(header) @@ -79,18 +94,24 @@ def decode_header(header): if unenc: # Should we continue a long line? if decoded and decoded[-1][1] is None: - decoded[-1] = (decoded[-1][0] + dec, None) + decoded[-1] = (decoded[-1][0] + SPACE + unenc, None) else: decoded.append((unenc, None)) if parts: charset, encoding = [s.lower() for s in parts[0:2]] encoded = parts[2] - dec = '' + dec = None if encoding == 'q': dec = email.quopriMIME.header_decode(encoded) elif encoding == 'b': - dec = email.base64MIME.decode(encoded) - else: + try: + dec = email.base64MIME.decode(encoded) + except binascii.Error: + # Turn this into a higher level exception. BAW: Right + # now we throw the lower level exception away but + # when/if we get exception chaining, we'll preserve it. + raise HeaderParseError + if dec is None: dec = encoded if decoded and decoded[-1][1] == charset: @@ -126,8 +147,9 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None, class Header: - def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None, - continuation_ws=' '): + def __init__(self, s=None, charset=None, + maxlinelen=None, header_name=None, + continuation_ws=' ', errors='strict'): """Create a MIME-compliant header that can contain many character sets. Optional s is the initial header value. If None, the initial header @@ -150,6 +172,8 @@ class Header: continuation_ws must be RFC 2822 compliant folding whitespace (usually either a space or a hard tab) which will be prepended to continuation lines. + + errors is passed through to the .append() call. """ if charset is None: charset = USASCII @@ -161,7 +185,7 @@ class Header: # BAW: I believe `chunks' and `maxlinelen' should be non-public. self._chunks = [] if s is not None: - self.append(s, charset) + self.append(s, charset, errors) if maxlinelen is None: maxlinelen = MAXLINELEN if header_name is None: @@ -182,9 +206,24 @@ class Header: def __unicode__(self): """Helper for the built-in unicode function.""" - # charset item is a Charset instance so we need to stringify it. - uchunks = [unicode(s, str(charset)) for s, charset in self._chunks] - return u''.join(uchunks) + uchunks = [] + lastcs = None + for s, charset in self._chunks: + # We must preserve spaces between encoded and non-encoded word + # boundaries, which means for us we need to add a space when we go + # from a charset to None/us-ascii, or from None/us-ascii to a + # charset. Only do this for the second and subsequent chunks. + nextcs = charset + if uchunks: + if lastcs is not None: + if nextcs is None or nextcs == 'us-ascii': + uchunks.append(USPACE) + nextcs = None + elif nextcs is not None and nextcs <> 'us-ascii': + uchunks.append(USPACE) + lastcs = nextcs + uchunks.append(unicode(s, str(charset))) + return UEMPTYSTRING.join(uchunks) # Rich comparison operators for equality only. BAW: does it make sense to # have or explicitly disable <, <=, >, >= operators? @@ -196,7 +235,7 @@ class Header: def __ne__(self, other): return not self == other - def append(self, s, charset=None): + def append(self, s, charset=None, errors='strict'): """Append a string to the MIME header. Optional charset, if given, should be a Charset instance or the name @@ -213,6 +252,9 @@ class Header: using RFC 2047 rules, the Unicode string will be encoded using the following charsets in order: us-ascii, the charset hint, utf-8. The first character set not to provoke a UnicodeError is used. + + Optional `errors' is passed as the third argument to any unicode() or + ustr.encode() call. """ if charset is None: charset = self._charset @@ -227,12 +269,12 @@ class Header: # Possibly raise UnicodeError if the byte string can't be # converted to a unicode with the input codec of the charset. incodec = charset.input_codec or 'us-ascii' - ustr = unicode(s, incodec) + ustr = unicode(s, incodec, errors) # Now make sure that the unicode could be converted back to a # byte string with the output codec, which may be different # than the iput coded. Still, use the original byte string. outcodec = charset.output_codec or 'us-ascii' - ustr.encode(outcodec) + ustr.encode(outcodec, errors) elif isinstance(s, UnicodeType): # Now we have to be sure the unicode string can be converted # to a byte string with a reasonable output codec. We want to @@ -240,7 +282,7 @@ class Header: for charset in USASCII, charset, UTF8: try: outcodec = charset.output_codec or 'us-ascii' - s = s.encode(outcodec) + s = s.encode(outcodec, errors) break except UnicodeError: pass @@ -248,13 +290,13 @@ class Header: assert False, 'utf-8 conversion failed' self._chunks.append((s, charset)) - def _split(self, s, charset, firstline=False): + def _split(self, s, charset, maxlinelen, splitchars): # Split up a header safely for use with encode_chunks. splittable = charset.to_splittable(s) - encoded = charset.from_splittable(splittable) + encoded = charset.from_splittable(splittable, True) elen = charset.encoded_header_len(encoded) - - if elen <= self._maxlinelen: + # If the line's encoded length first, just return it + if elen <= maxlinelen: return [(encoded, charset)] # If we have undetermined raw 8bit characters sitting in a byte # string, we really don't know what the right thing to do is. We @@ -262,7 +304,7 @@ class Header: # could break if we split it between pairs. The least harm seems to # be to not split the header at all, but that means they could go out # longer than maxlinelen. - elif charset == '8bit': + if charset == '8bit': return [(s, charset)] # BAW: I'm not sure what the right test here is. What we're trying to # do is be faithful to RFC 2822's recommendation that ($2.2.3): @@ -275,101 +317,31 @@ class Header: # For now, I can only imagine doing this when the charset is us-ascii, # although it's possible that other charsets may also benefit from the # higher-level syntactic breaks. - # elif charset == 'us-ascii': - return self._ascii_split(s, charset, firstline) + return self._split_ascii(s, charset, maxlinelen, splitchars) # BAW: should we use encoded? elif elen == len(s): # We can split on _maxlinelen boundaries because we know that the # encoding won't change the size of the string - splitpnt = self._maxlinelen + splitpnt = maxlinelen first = charset.from_splittable(splittable[:splitpnt], False) last = charset.from_splittable(splittable[splitpnt:], False) else: - # Divide and conquer. - halfway = _floordiv(len(splittable), 2) - first = charset.from_splittable(splittable[:halfway], False) - last = charset.from_splittable(splittable[halfway:], False) - # Do the split - return self._split(first, charset, firstline) + \ - self._split(last, charset) - - def _ascii_split(self, s, charset, firstline): - # Attempt to split the line at the highest-level syntactic break - # possible. Note that we don't have a lot of smarts about field - # syntax; we just try to break on semi-colons, then whitespace. - rtn = [] - lines = s.splitlines() - while lines: - line = lines.pop(0) - if firstline: - maxlinelen = self._firstlinelen - firstline = False - else: - #line = line.lstrip() - maxlinelen = self._maxlinelen - # Short lines can remain unchanged - if len(line.replace('\t', SPACE8)) <= maxlinelen: - rtn.append(line) - else: - oldlen = len(line) - # Try to break the line on semicolons, but if that doesn't - # work, try to split on folding whitespace. - while len(line) > maxlinelen: - i = line.rfind(';', 0, maxlinelen) - if i < 0: - break - rtn.append(line[:i] + ';') - line = line[i+1:] - # Is the remaining stuff still longer than maxlinelen? - if len(line) <= maxlinelen: - # Splitting on semis worked - rtn.append(line) - continue - # Splitting on semis didn't finish the job. If it did any - # work at all, stick the remaining junk on the front of the - # `lines' sequence and let the next pass do its thing. - if len(line) <> oldlen: - lines.insert(0, line) - continue - # Otherwise, splitting on semis didn't help at all. - parts = re.split(r'(\s+)', line) - if len(parts) == 1 or (len(parts) == 3 and - parts[0].endswith(':')): - # This line can't be split on whitespace. There's now - # little we can do to get this into maxlinelen. BAW: - # We're still potentially breaking the RFC by possibly - # allowing lines longer than the absolute maximum of 998 - # characters. For now, let it slide. - # - # len(parts) will be 1 if this line has no `Field: ' - # prefix, otherwise it will be len(3). - rtn.append(line) - continue - # There is whitespace we can split on. - first = parts.pop(0) - sublines = [first] - acc = len(first) - while parts: - len0 = len(parts[0]) - len1 = len(parts[1]) - if acc + len0 + len1 <= maxlinelen: - sublines.append(parts.pop(0)) - sublines.append(parts.pop(0)) - acc += len0 + len1 - else: - # Split it here, but don't forget to ignore the - # next whitespace-only part - if first <> '': - rtn.append(EMPTYSTRING.join(sublines)) - del parts[0] - first = parts.pop(0) - sublines = [first] - acc = len(first) - rtn.append(EMPTYSTRING.join(sublines)) - return [(chunk, charset) for chunk in rtn] - - def _encode_chunks(self, newchunks): + # Binary search for split point + first, last = _binsplit(splittable, charset, maxlinelen) + # first is of the proper length so just wrap it in the appropriate + # chrome. last must be recursively split. + fsplittable = charset.to_splittable(first) + fencoded = charset.from_splittable(fsplittable, True) + chunk = [(fencoded, charset)] + return chunk + self._split(last, charset, self._maxlinelen, splitchars) + + def _split_ascii(self, s, charset, firstlen, splitchars): + chunks = _split_ascii(s, firstlen, self._maxlinelen, + self._continuation_ws, splitchars) + return zip(chunks, [charset]*len(chunks)) + + def _encode_chunks(self, newchunks, maxlinelen): # MIME-encode a header with many different charsets and/or encodings. # # Given a list of pairs (string, charset), return a MIME-encoded @@ -387,19 +359,24 @@ class Header: # # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n # =?charset2?b?SvxyZ2VuIEL2aW5n?=" - # chunks = [] for header, charset in newchunks: + if not header: + continue if charset is None or charset.header_encoding is None: - # There's no encoding for this chunk's charsets - _max_append(chunks, header, self._maxlinelen) + s = header + else: + s = charset.header_encode(header) + # Don't add more folding whitespace than necessary + if chunks and chunks[-1].endswith(' '): + extra = '' else: - _max_append(chunks, charset.header_encode(header), - self._maxlinelen, ' ') + extra = ' ' + _max_append(chunks, s, maxlinelen, extra) joiner = NL + self._continuation_ws return joiner.join(chunks) - def encode(self): + def encode(self, splitchars=';, '): """Encode a message header into an RFC-compliant format. There are many issues involved in converting a given string for use in @@ -416,8 +393,123 @@ class Header: If the given charset is not known or an error occurs during conversion, this function will return the header untouched. + + Optional splitchars is a string containing characters to split long + ASCII lines on, in rough support of RFC 2822's `highest level + syntactic breaks'. This doesn't affect RFC 2047 encoded lines. """ newchunks = [] + maxlinelen = self._firstlinelen + lastlen = 0 for s, charset in self._chunks: - newchunks += self._split(s, charset, True) - return self._encode_chunks(newchunks) + # The first bit of the next chunk should be just long enough to + # fill the next line. Don't forget the space separating the + # encoded words. + targetlen = maxlinelen - lastlen - 1 + if targetlen < charset.encoded_header_len(''): + # Stick it on the next line + targetlen = maxlinelen + newchunks += self._split(s, charset, targetlen, splitchars) + lastchunk, lastcharset = newchunks[-1] + lastlen = lastcharset.encoded_header_len(lastchunk) + return self._encode_chunks(newchunks, maxlinelen) + + + +def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars): + lines = [] + maxlen = firstlen + for line in s.splitlines(): + # Ignore any leading whitespace (i.e. continuation whitespace) already + # on the line, since we'll be adding our own. + line = line.lstrip() + if len(line) < maxlen: + lines.append(line) + maxlen = restlen + continue + # Attempt to split the line at the highest-level syntactic break + # possible. Note that we don't have a lot of smarts about field + # syntax; we just try to break on semi-colons, then commas, then + # whitespace. + for ch in splitchars: + if line.find(ch) >= 0: + break + else: + # There's nothing useful to split the line on, not even spaces, so + # just append this line unchanged + lines.append(line) + maxlen = restlen + continue + # Now split the line on the character plus trailing whitespace + cre = re.compile(r'%s\s*' % ch) + if ch in ';,': + eol = ch + else: + eol = '' + joiner = eol + ' ' + joinlen = len(joiner) + wslen = len(continuation_ws.replace('\t', SPACE8)) + this = [] + linelen = 0 + for part in cre.split(line): + curlen = linelen + max(0, len(this)-1) * joinlen + partlen = len(part) + onfirstline = not lines + # We don't want to split after the field name, if we're on the + # first line and the field name is present in the header string. + if ch == ' ' and onfirstline and \ + len(this) == 1 and fcre.match(this[0]): + this.append(part) + linelen += partlen + elif curlen + partlen > maxlen: + if this: + lines.append(joiner.join(this) + eol) + # If this part is longer than maxlen and we aren't already + # splitting on whitespace, try to recursively split this line + # on whitespace. + if partlen > maxlen and ch <> ' ': + subl = _split_ascii(part, maxlen, restlen, + continuation_ws, ' ') + lines.extend(subl[:-1]) + this = [subl[-1]] + else: + this = [part] + linelen = wslen + len(this[-1]) + maxlen = restlen + else: + this.append(part) + linelen += partlen + # Put any left over parts on a line by themselves + if this: + lines.append(joiner.join(this)) + return lines + + + +def _binsplit(splittable, charset, maxlinelen): + i = 0 + j = len(splittable) + while i < j: + # Invariants: + # 1. splittable[:k] fits for all k <= i (note that we *assume*, + # at the start, that splittable[:0] fits). + # 2. splittable[:k] does not fit for any k > j (at the start, + # this means we shouldn't look at any k > len(splittable)). + # 3. We don't know about splittable[:k] for k in i+1..j. + # 4. We want to set i to the largest k that fits, with i <= k <= j. + # + m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j + chunk = charset.from_splittable(splittable[:m], True) + chunklen = charset.encoded_header_len(chunk) + if chunklen <= maxlinelen: + # m is acceptable, so is a new lower bound. + i = m + else: + # m is not acceptable, so final i must be < m. + j = m - 1 + # i == j. Invariant #1 implies that splittable[:i] fits, and + # invariant #2 implies that splittable[:i+1] does not fit, so i + # is what we're looking for. + first = charset.from_splittable(splittable[:i], False) + last = charset.from_splittable(splittable[i:], False) + return first, last diff --git a/Lib/email/MIMEText.py b/Lib/email/MIMEText.py index d91b93d..d049ad9 100644 --- a/Lib/email/MIMEText.py +++ b/Lib/email/MIMEText.py @@ -17,8 +17,7 @@ class MIMEText(MIMENonMultipart): _encoder=None): """Create a text/* type MIME document. - _text is the string for this message object. If the text does not end - in a newline, one is added. + _text is the string for this message object. _subtype is the MIME sub content type, defaulting to "plain". @@ -35,8 +34,6 @@ class MIMEText(MIMENonMultipart): """ MIMENonMultipart.__init__(self, 'text', _subtype, **{'charset': _charset}) - if _text and not _text.endswith('\n'): - _text += '\n' self.set_payload(_text, _charset) if _encoder is not None: warnings.warn('_encoder argument is obsolete.', diff --git a/Lib/email/Message.py b/Lib/email/Message.py index 16ae120..66f8640 100644 --- a/Lib/email/Message.py +++ b/Lib/email/Message.py @@ -5,13 +5,15 @@ """ import re +import uu +import binascii import warnings from cStringIO import StringIO from types import ListType, TupleType, StringType # Intrapackage imports -from email import Errors from email import Utils +from email import Errors from email import Charset SEMISPACE = '; ' @@ -164,14 +166,18 @@ class Message: the list object, you modify the message's payload in place. Optional i returns that index into the payload. - Optional decode is a flag (defaulting to False) indicating whether the - payload should be decoded or not, according to the - Content-Transfer-Encoding header. When True and the message is not a - multipart, the payload will be decoded if this header's value is - `quoted-printable' or `base64'. If some other encoding is used, or - the header is missing, the payload is returned as-is (undecoded). If - the message is a multipart and the decode flag is True, then None is - returned. + Optional decode is a flag indicating whether the payload should be + decoded or not, according to the Content-Transfer-Encoding header + (default is False). + + When True and the message is not a multipart, the payload will be + decoded if this header's value is `quoted-printable' or `base64'. If + some other encoding is used, or the header is missing, or if the + payload has bogus data (i.e. bogus base64 or uuencoded data), the + payload is returned as-is. + + If the message is a multipart and the decode flag is True, then None + is returned. """ if i is None: payload = self._payload @@ -182,11 +188,23 @@ class Message: if decode: if self.is_multipart(): return None - cte = self.get('content-transfer-encoding', '') - if cte.lower() == 'quoted-printable': + cte = self.get('content-transfer-encoding', '').lower() + if cte == 'quoted-printable': return Utils._qdecode(payload) - elif cte.lower() == 'base64': - return Utils._bdecode(payload) + elif cte == 'base64': + try: + return Utils._bdecode(payload) + except binascii.Error: + # Incorrect padding + return payload + elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): + sfp = StringIO() + try: + uu.decode(StringIO(payload+'\n'), sfp) + payload = sfp.getvalue() + except uu.Error: + # Some decoding problem + return payload # Everything else, including encodings with 8bit or 7bit are returned # unchanged. return payload diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py index 5fea3c3..09fac45 100644 --- a/Lib/email/Parser.py +++ b/Lib/email/Parser.py @@ -20,7 +20,7 @@ except NameError: True = 1 False = 0 -nlcre = re.compile('\r\n|\r|\n') +NLCRE = re.compile('\r\n|\r|\n') @@ -59,9 +59,9 @@ class Parser: meaning it parses the entire contents of the file. """ root = self._class() - self._parseheaders(root, fp) + firstbodyline = self._parseheaders(root, fp) if not headersonly: - self._parsebody(root, fp) + self._parsebody(root, fp, firstbodyline) return root def parsestr(self, text, headersonly=False): @@ -80,6 +80,7 @@ class Parser: lastheader = '' lastvalue = [] lineno = 0 + firstbodyline = None while True: # Don't strip the line before we test for the end condition, # because whitespace-only header lines are RFC compliant @@ -120,13 +121,16 @@ class Parser: if i < 0: if self._strict: raise Errors.HeaderParseError( - "Not a header, not a continuation: ``%s''"%line) + "Not a header, not a continuation: ``%s''" % line) elif lineno == 1 and line.startswith('--'): # allow through duplicate boundary tags. continue else: - raise Errors.HeaderParseError( - "Not a header, not a continuation: ``%s''"%line) + # There was no separating blank line as mandated by RFC + # 2822, but we're in non-strict mode. So just offer up + # this current line as the first body line. + firstbodyline = line + break if lastheader: container[lastheader] = NL.join(lastvalue) lastheader = line[:i] @@ -134,8 +138,9 @@ class Parser: # Make sure we retain the last header if lastheader: container[lastheader] = NL.join(lastvalue) + return firstbodyline - def _parsebody(self, container, fp): + def _parsebody(self, container, fp, firstbodyline=None): # Parse the body, but first split the payload on the content-type # boundary if present. boundary = container.get_boundary() @@ -152,6 +157,8 @@ class Parser: # boundary. separator = '--' + boundary payload = fp.read() + if firstbodyline is not None: + payload = firstbodyline + '\n' + payload # We use an RE here because boundaries can have trailing # whitespace. mo = re.search( @@ -169,7 +176,7 @@ class Parser: preamble = payload[0:start] # Find out what kind of line endings we're using start += len(mo.group('sep')) + len(mo.group('ws')) - mo = nlcre.search(payload, start) + mo = NLCRE.search(payload, start) if mo: start += len(mo.group(0)) # We create a compiled regexp first because we need to be able to @@ -221,9 +228,13 @@ class Parser: # msgobj in this case is the "message/rfc822" container msgobj = self.parsestr(parthdrs, headersonly=1) # while submsgobj is the message itself - submsgobj = self.parsestr(part) - msgobj.attach(submsgobj) msgobj.set_default_type('message/rfc822') + maintype = msgobj.get_content_maintype() + if maintype in ('message', 'multipart'): + submsgobj = self.parsestr(part) + msgobj.attach(submsgobj) + else: + msgobj.set_payload(part) else: msgobj = self.parsestr(part) container.preamble = preamble @@ -256,7 +267,10 @@ class Parser: self._parsebody(msg, fp) container.attach(msg) else: - container.set_payload(fp.read()) + text = fp.read() + if firstbodyline is not None: + text = firstbodyline + '\n' + text + container.set_payload(text) @@ -270,6 +284,9 @@ class HeaderParser(Parser): Parsing with this subclass can be considerably faster if all you're interested in is the message headers. """ - def _parsebody(self, container, fp): + def _parsebody(self, container, fp, firstbodyline=None): # Consume but do not parse, the body - container.set_payload(fp.read()) + text = fp.read() + if firstbodyline is not None: + text = firstbodyline + '\n' + text + container.set_payload(text) diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py index b619c6b..2b8b94f 100644 --- a/Lib/email/Utils.py +++ b/Lib/email/Utils.py @@ -13,13 +13,13 @@ import warnings from cStringIO import StringIO from types import ListType -from rfc822 import quote -from rfc822 import AddressList as _AddressList -from rfc822 import mktime_tz +from email._parseaddr import quote +from email._parseaddr import AddressList as _AddressList +from email._parseaddr import mktime_tz # We need wormarounds for bugs in these methods in older Pythons (see below) -from rfc822 import parsedate as _parsedate -from rfc822 import parsedate_tz as _parsedate_tz +from email._parseaddr import parsedate as _parsedate +from email._parseaddr import parsedate_tz as _parsedate_tz try: True, False @@ -54,8 +54,8 @@ EMPTYSTRING = '' UEMPTYSTRING = u'' CRLF = '\r\n' -specialsre = re.compile(r'[][\()<>@,:;".]') -escapesre = re.compile(r'[][\()"]') +specialsre = re.compile(r'[][\\()<>@,:;".]') +escapesre = re.compile(r'[][\\()"]') @@ -66,8 +66,6 @@ def _identity(s): def _bdecode(s): - if not s: - return s # We can't quite use base64.encodestring() since it tacks on a "courtesy # newline". Blech! if not s: @@ -280,9 +278,11 @@ def unquote(str): def decode_rfc2231(s): """Decode string according to RFC 2231""" import urllib - charset, language, s = s.split("'", 2) - s = urllib.unquote(s) - return charset, language, s + parts = s.split("'", 2) + if len(parts) == 1: + return None, None, s + charset, language, s = parts + return charset, language, urllib.unquote(s) def encode_rfc2231(s, charset=None, language=None): @@ -335,6 +335,6 @@ def decode_params(params): for num, continuation in continuations: value.append(continuation) charset, language, value = decode_rfc2231(EMPTYSTRING.join(value)) - new_params.append((name, - (charset, language, '"%s"' % quote(value)))) + new_params.append( + (name, (charset, language, '"%s"' % quote(value)))) return new_params diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py index b784da8..71b5b5d 100644 --- a/Lib/email/__init__.py +++ b/Lib/email/__init__.py @@ -4,7 +4,7 @@ """A package for parsing, handling, and generating email messages. """ -__version__ = '2.4.3' +__version__ = '2.5' __all__ = [ 'base64MIME', diff --git a/Lib/email/_compat21.py b/Lib/email/_compat21.py index de8c447..0e0b3d0 100644 --- a/Lib/email/_compat21.py +++ b/Lib/email/_compat21.py @@ -7,6 +7,9 @@ from cStringIO import StringIO from types import StringType, UnicodeType +False = 0 +True = 1 + # This function will become a method of the Message class @@ -31,17 +34,20 @@ def _floordiv(i, j): def _isstring(obj): - return isinstance(obj, StringType) or isinstance(obj, UnicodeType) + return isinstance(obj, StringType) or isinstance(obj, UnicodeType) # These two functions are imported into the Iterators.py interface module. # The Python 2.2 version uses generators for efficiency. -def body_line_iterator(msg): - """Iterate over the parts, returning string payloads line-by-line.""" +def body_line_iterator(msg, decode=False): + """Iterate over the parts, returning string payloads line-by-line. + + Optional decode (default False) is passed through to .get_payload(). + """ lines = [] for subpart in msg.walk(): - payload = subpart.get_payload() + payload = subpart.get_payload(decode=decode) if _isstring(payload): for line in StringIO(payload).readlines(): lines.append(line) diff --git a/Lib/email/_compat22.py b/Lib/email/_compat22.py index a05451f..ec2d2f8 100644 --- a/Lib/email/_compat22.py +++ b/Lib/email/_compat22.py @@ -38,10 +38,13 @@ def _isstring(obj): # These two functions are imported into the Iterators.py interface module. # The Python 2.2 version uses generators for efficiency. -def body_line_iterator(msg): - """Iterate over the parts, returning string payloads line-by-line.""" +def body_line_iterator(msg, decode=False): + """Iterate over the parts, returning string payloads line-by-line. + + Optional decode (default False) is passed through to .get_payload(). + """ for subpart in msg.walk(): - payload = subpart.get_payload() + payload = subpart.get_payload(decode=decode) if _isstring(payload): for line in StringIO(payload): yield line diff --git a/Lib/email/base64MIME.py b/Lib/email/base64MIME.py index 56e44e1..a247773 100644 --- a/Lib/email/base64MIME.py +++ b/Lib/email/base64MIME.py @@ -102,9 +102,6 @@ def header_encode(header, charset='iso-8859-1', keep_eols=False, max_encoded = maxlinelen - len(charset) - MISC_LEN max_unencoded = _floordiv(max_encoded * 3, 4) - # BAW: Ben's original code used a step of max_unencoded, but I think it - # ought to be max_encoded. Otherwise, where's max_encoded used? I'm - # still not sure what the for i in range(0, len(header), max_unencoded): base64ed.append(b2a_base64(header[i:i+max_unencoded])) diff --git a/Lib/email/quopriMIME.py b/Lib/email/quopriMIME.py index 18ddd89..67369b5 100644 --- a/Lib/email/quopriMIME.py +++ b/Lib/email/quopriMIME.py @@ -82,7 +82,7 @@ def body_quopri_len(str): def _max_append(L, s, maxlen, extra=''): if not L: L.append(s.lstrip()) - elif len(L[-1]) + len(s) < maxlen: + elif len(L[-1]) + len(s) <= maxlen: L[-1] += extra + s else: L.append(s.lstrip()) @@ -116,7 +116,8 @@ def header_encode(header, charset="iso-8859-1", keep_eols=False, =?charset?q?Silly_=C8nglish_Kn=EEghts?=" with each line wrapped safely at, at most, maxlinelen characters (defaults - to 76 characters). + to 76 characters). If maxlinelen is None, the entire string is encoded in + one chunk with no splitting. End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted to the canonical email line separator \\r\\n unless the keep_eols @@ -134,9 +135,13 @@ def header_encode(header, charset="iso-8859-1", keep_eols=False, header = fix_eols(header) # Quopri encode each line, in encoded chunks no greater than maxlinelen in - # lenght, after the RFC chrome is added in. + # length, after the RFC chrome is added in. quoted = [] - max_encoded = maxlinelen - len(charset) - MISC_LEN + if maxlinelen is None: + # An obnoxiously large number that's good enough + max_encoded = 100000 + else: + max_encoded = maxlinelen - len(charset) - MISC_LEN - 1 for c in header: # Space may be represented as _ instead of =20 for readability diff --git a/Lib/email/test/data/msg_21.txt b/Lib/email/test/data/msg_21.txt index 5b2e777..23590b2 100644 --- a/Lib/email/test/data/msg_21.txt +++ b/Lib/email/test/data/msg_21.txt @@ -10,13 +10,11 @@ MIME-Version: 1.0 Content-Transfer-Encoding: 7bit One - --BOUNDARY Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Two - --BOUNDARY-- End of MIME message diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index 5fca8a2..280b400 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -1,15 +1,15 @@ -# Copyright (C) 2001,2002 Python Software Foundation +# Copyright (C) 2001,2002,2003 Python Software Foundation # email package unit tests -import sys import os +import sys import time -import unittest import base64 import difflib +import unittest +import warnings from cStringIO import StringIO from types import StringType, ListType -import warnings import email @@ -42,11 +42,17 @@ SPACE = ' ' # We don't care about DeprecationWarnings warnings.filterwarnings('ignore', '', DeprecationWarning, __name__) +try: + True, False +except NameError: + True = 1 + False = 0 + -def openfile(filename): +def openfile(filename, mode='r'): path = os.path.join(os.path.dirname(landmark), 'data', filename) - return open(path, 'r') + return open(path, mode) @@ -67,10 +73,10 @@ class TestEmailBase(unittest.TestCase): # Python 2.1 ndiffAssertEqual = unittest.TestCase.assertEqual - def _msgobj(self, filename): + def _msgobj(self, filename, strict=False): fp = openfile(findfile(filename)) try: - msg = email.message_from_file(fp) + msg = email.message_from_file(fp, strict=strict) finally: fp.close() return msg @@ -184,20 +190,31 @@ class TestMessageAPI(TestEmailBase): eq = self.assertEqual msg = self._msgobj('msg_10.txt') # The outer message is a multipart - eq(msg.get_payload(decode=1), None) + eq(msg.get_payload(decode=True), None) # Subpart 1 is 7bit encoded - eq(msg.get_payload(0).get_payload(decode=1), + eq(msg.get_payload(0).get_payload(decode=True), 'This is a 7bit encoded message.\n') # Subpart 2 is quopri - eq(msg.get_payload(1).get_payload(decode=1), + eq(msg.get_payload(1).get_payload(decode=True), '\xa1This is a Quoted Printable encoded message!\n') # Subpart 3 is base64 - eq(msg.get_payload(2).get_payload(decode=1), + eq(msg.get_payload(2).get_payload(decode=True), 'This is a Base64 encoded message.') # Subpart 4 has no Content-Transfer-Encoding: header. - eq(msg.get_payload(3).get_payload(decode=1), + eq(msg.get_payload(3).get_payload(decode=True), 'This has no Content-Transfer-Encoding: header.\n') + def test_get_decoded_uu_payload(self): + eq = self.assertEqual + msg = Message() + msg.set_payload('begin 666 -\n+:&5L;&\\@=V]R;&0 \n \nend\n') + for cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): + msg['content-transfer-encoding'] = cte + eq(msg.get_payload(decode=True), 'hello world') + # Now try some bogus data + msg.set_payload('foo') + eq(msg.get_payload(decode=True), 'foo') + def test_decoded_generator(self): eq = self.assertEqual msg = self._msgobj('msg_07.txt') @@ -310,11 +327,11 @@ class TestMessageAPI(TestEmailBase): eq(msg.get_param('charset'), 'iso-2022-jp') msg.set_param('importance', 'high value') eq(msg.get_param('importance'), 'high value') - eq(msg.get_param('importance', unquote=0), '"high value"') + eq(msg.get_param('importance', unquote=False), '"high value"') eq(msg.get_params(), [('text/plain', ''), ('charset', 'iso-2022-jp'), ('importance', 'high value')]) - eq(msg.get_params(unquote=0), [('text/plain', ''), + eq(msg.get_params(unquote=False), [('text/plain', ''), ('charset', '"iso-2022-jp"'), ('importance', '"high value"')]) msg.set_param('charset', 'iso-9999-xx', header='X-Jimmy') @@ -452,6 +469,14 @@ class TestMessageAPI(TestEmailBase): eq(msg.values(), ['One Hundred', 'Twenty', 'Three', 'Eleven']) self.assertRaises(KeyError, msg.replace_header, 'Fourth', 'Missing') + def test_broken_base64_payload(self): + x = 'AwDp0P7//y6LwKEAcPa/6Q=9' + msg = Message() + msg['content-type'] = 'audio/x-midi' + msg['content-transfer-encoding'] = 'base64' + msg.set_payload(x) + self.assertEqual(msg.get_payload(decode=True), x) + # Test the email.Encoders module @@ -459,21 +484,21 @@ class TestEncoders(unittest.TestCase): def test_encode_noop(self): eq = self.assertEqual msg = MIMEText('hello world', _encoder=Encoders.encode_noop) - eq(msg.get_payload(), 'hello world\n') + eq(msg.get_payload(), 'hello world') def test_encode_7bit(self): eq = self.assertEqual msg = MIMEText('hello world', _encoder=Encoders.encode_7or8bit) - eq(msg.get_payload(), 'hello world\n') + eq(msg.get_payload(), 'hello world') eq(msg['content-transfer-encoding'], '7bit') msg = MIMEText('hello \x7f world', _encoder=Encoders.encode_7or8bit) - eq(msg.get_payload(), 'hello \x7f world\n') + eq(msg.get_payload(), 'hello \x7f world') eq(msg['content-transfer-encoding'], '7bit') def test_encode_8bit(self): eq = self.assertEqual msg = MIMEText('hello \x80 world', _encoder=Encoders.encode_7or8bit) - eq(msg.get_payload(), 'hello \x80 world\n') + eq(msg.get_payload(), 'hello \x80 world') eq(msg['content-transfer-encoding'], '8bit') def test_encode_empty_payload(self): @@ -485,13 +510,13 @@ class TestEncoders(unittest.TestCase): def test_encode_base64(self): eq = self.assertEqual msg = MIMEText('hello world', _encoder=Encoders.encode_base64) - eq(msg.get_payload(), 'aGVsbG8gd29ybGQK\n') + eq(msg.get_payload(), 'aGVsbG8gd29ybGQ=') eq(msg['content-transfer-encoding'], 'base64') def test_encode_quoted_printable(self): eq = self.assertEqual msg = MIMEText('hello world', _encoder=Encoders.encode_quopri) - eq(msg.get_payload(), 'hello=20world\n') + eq(msg.get_payload(), 'hello=20world') eq(msg['content-transfer-encoding'], 'quoted-printable') def test_default_cte(self): @@ -560,7 +585,7 @@ bug demonstration g_head = "Die Mieter treten hier ein werden mit einem Foerderband komfortabel den Korridor entlang, an s\xfcdl\xfcndischen Wandgem\xe4lden vorbei, gegen die rotierenden Klingen bef\xf6rdert. " cz_head = "Finan\xe8ni metropole se hroutily pod tlakem jejich d\xf9vtipu.. " utf8_head = u"\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das Nunstuck git und Slotermeyer? Ja! Beiherhund das Oder die Flipperwaldt gersput.\u300d\u3068\u8a00\u3063\u3066\u3044\u307e\u3059\u3002".encode("utf-8") - h = Header(g_head, g) + h = Header(g_head, g, header_name='Subject') h.append(cz_head, cz) h.append(utf8_head, utf8) msg = Message() @@ -568,40 +593,32 @@ bug demonstration sfp = StringIO() g = Generator(sfp) g.flatten(msg) - eq(sfp.getvalue(), '''\ -Subject: =?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_eine?= - =?iso-8859-1?q?m_Foerderband_komfortabel_den_Korridor_ent?= - =?iso-8859-1?q?lang=2C_an_s=FCdl=FCndischen_Wandgem=E4lden_vorbei?= - =?iso-8859-1?q?=2C_gegen_die_rotierenden_Klingen_bef=F6rdert=2E_?= - =?iso-8859-2?q?Finan=E8ni_metropole_se_hroutil?= - =?iso-8859-2?q?y_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= - =?utf-8?b?5q2j56K644Gr6KiA44GG44Go57+76Kiz44Gv?= - =?utf-8?b?44GV44KM44Gm44GE44G+44Gb44KT44CC5LiA?= - =?utf-8?b?6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM?= - =?utf-8?b?44CB44GC44Go44Gv44Gn44Gf44KJ44KB44Gn?= - =?utf-8?b?44GZ44CC5a6f6Zqb44Gr44Gv44CMV2VubiBpc3QgZGE=?= - =?utf-8?q?s_Nunstuck_git_und?= - =?utf-8?q?_Slotermeyer=3F_Ja!_Beiherhund_das_Ode?= - =?utf-8?q?r_die_Flipperwaldt?= - =?utf-8?b?IGdlcnNwdXQu44CN44Go6KiA44Gj44Gm44GE44G+44GZ44CC?= + eq(sfp.getvalue(), """\ +Subject: =?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_einem_Foerd?= + =?iso-8859-1?q?erband_komfortabel_den_Korridor_entlang=2C_an_s=FCdl=FCndi?= + =?iso-8859-1?q?schen_Wandgem=E4lden_vorbei=2C_gegen_die_rotierenden_Kling?= + =?iso-8859-1?q?en_bef=F6rdert=2E_?= =?iso-8859-2?q?Finan=E8ni_met?= + =?iso-8859-2?q?ropole_se_hroutily_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= + =?utf-8?b?5q2j56K644Gr6KiA44GG44Go57+76Kiz44Gv44GV44KM44Gm44GE?= + =?utf-8?b?44G+44Gb44KT44CC5LiA6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM44CB?= + =?utf-8?b?44GC44Go44Gv44Gn44Gf44KJ44KB44Gn44GZ44CC5a6f6Zqb44Gr44Gv44CM?= + =?utf-8?q?Wenn_ist_das_Nunstuck_git_und_Slotermeyer=3F_Ja!_Beiherhund_das?= + =?utf-8?b?IE9kZXIgZGllIEZsaXBwZXJ3YWxkdCBnZXJzcHV0LuOAjeOBqOiogOOBow==?= + =?utf-8?b?44Gm44GE44G+44GZ44CC?= -''') - eq(h.encode(), '''\ -=?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_eine?= - =?iso-8859-1?q?m_Foerderband_komfortabel_den_Korridor_ent?= - =?iso-8859-1?q?lang=2C_an_s=FCdl=FCndischen_Wandgem=E4lden_vorbei?= - =?iso-8859-1?q?=2C_gegen_die_rotierenden_Klingen_bef=F6rdert=2E_?= - =?iso-8859-2?q?Finan=E8ni_metropole_se_hroutil?= - =?iso-8859-2?q?y_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= - =?utf-8?b?5q2j56K644Gr6KiA44GG44Go57+76Kiz44Gv?= - =?utf-8?b?44GV44KM44Gm44GE44G+44Gb44KT44CC5LiA?= - =?utf-8?b?6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM?= - =?utf-8?b?44CB44GC44Go44Gv44Gn44Gf44KJ44KB44Gn?= - =?utf-8?b?44GZ44CC5a6f6Zqb44Gr44Gv44CMV2VubiBpc3QgZGE=?= - =?utf-8?q?s_Nunstuck_git_und?= - =?utf-8?q?_Slotermeyer=3F_Ja!_Beiherhund_das_Ode?= - =?utf-8?q?r_die_Flipperwaldt?= - =?utf-8?b?IGdlcnNwdXQu44CN44Go6KiA44Gj44Gm44GE44G+44GZ44CC?=''') +""") + eq(h.encode(), """\ +=?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_einem_Foerd?= + =?iso-8859-1?q?erband_komfortabel_den_Korridor_entlang=2C_an_s=FCdl=FCndi?= + =?iso-8859-1?q?schen_Wandgem=E4lden_vorbei=2C_gegen_die_rotierenden_Kling?= + =?iso-8859-1?q?en_bef=F6rdert=2E_?= =?iso-8859-2?q?Finan=E8ni_met?= + =?iso-8859-2?q?ropole_se_hroutily_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= + =?utf-8?b?5q2j56K644Gr6KiA44GG44Go57+76Kiz44Gv44GV44KM44Gm44GE?= + =?utf-8?b?44G+44Gb44KT44CC5LiA6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM44CB?= + =?utf-8?b?44GC44Go44Gv44Gn44Gf44KJ44KB44Gn44GZ44CC5a6f6Zqb44Gr44Gv44CM?= + =?utf-8?q?Wenn_ist_das_Nunstuck_git_und_Slotermeyer=3F_Ja!_Beiherhund_das?= + =?utf-8?b?IE9kZXIgZGllIEZsaXBwZXJ3YWxkdCBnZXJzcHV0LuOAjeOBqOiogOOBow==?= + =?utf-8?b?44Gm44GE44G+44GZ44CC?=""") def test_long_header_encode(self): eq = self.ndiffAssertEqual @@ -706,12 +723,13 @@ from modemcable093.139-201-24.que.mc.videotron.ca ([24.201.139.93] def test_long_8bit_header(self): eq = self.ndiffAssertEqual msg = Message() - h = Header('Britische Regierung gibt', 'iso-8859-1') + h = Header('Britische Regierung gibt', 'iso-8859-1', + header_name='Subject') h.append('gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte') msg['Subject'] = h eq(msg.as_string(), """\ -Subject: =?iso-8859-1?q?Britische_Regierung_gibt?= - =?iso-8859-1?q?gr=FCnes_Licht_f=FCr_Offshore-Windkraftprojekte?= +Subject: =?iso-8859-1?q?Britische_Regierung_gibt?= =?iso-8859-1?q?gr=FCnes?= + =?iso-8859-1?q?_Licht_f=FCr_Offshore-Windkraftprojekte?= """) @@ -724,6 +742,121 @@ Reply-To: Britische Regierung gibt gr\xfcnes Licht f\xfcr Offshore-Windkraftproj """) + def test_long_to_header(self): + eq = self.ndiffAssertEqual + to = '"Someone Test #A" ,,"Someone Test #B" , "Someone Test #C" , "Someone Test #D" ' + msg = Message() + msg['To'] = to + eq(msg.as_string(0), '''\ +To: "Someone Test #A" , , +\t"Someone Test #B" , +\t"Someone Test #C" , +\t"Someone Test #D" + +''') + + def test_long_line_after_append(self): + eq = self.ndiffAssertEqual + s = 'This is an example of string which has almost the limit of header length.' + h = Header(s) + h.append('Add another line.') + eq(h.encode(), """\ +This is an example of string which has almost the limit of header length. + Add another line.""") + + def test_shorter_line_with_append(self): + eq = self.ndiffAssertEqual + s = 'This is a shorter line.' + h = Header(s) + h.append('Add another sentence. (Surprise?)') + eq(h.encode(), + 'This is a shorter line. Add another sentence. (Surprise?)') + + def test_long_field_name(self): + eq = self.ndiffAssertEqual + fn = 'X-Very-Very-Very-Long-Header-Name' + gs = "Die Mieter treten hier ein werden mit einem Foerderband komfortabel den Korridor entlang, an s\xfcdl\xfcndischen Wandgem\xe4lden vorbei, gegen die rotierenden Klingen bef\xf6rdert. " + h = Header(gs, 'iso-8859-1', header_name=fn) + # BAW: this seems broken because the first line is too long + eq(h.encode(), """\ +=?iso-8859-1?q?Die_Mieter_treten_hier_?= + =?iso-8859-1?q?ein_werden_mit_einem_Foerderband_komfortabel_den_Korridor_?= + =?iso-8859-1?q?entlang=2C_an_s=FCdl=FCndischen_Wandgem=E4lden_vorbei=2C_g?= + =?iso-8859-1?q?egen_die_rotierenden_Klingen_bef=F6rdert=2E_?=""") + + def test_long_received_header(self): + h = 'from FOO.TLD (vizworld.acl.foo.tld [123.452.678.9]) by hrothgar.la.mastaler.com (tmda-ofmipd) with ESMTP; Wed, 05 Mar 2003 18:10:18 -0700' + msg = Message() + msg['Received-1'] = Header(h, continuation_ws='\t') + msg['Received-2'] = h + self.assertEqual(msg.as_string(), """\ +Received-1: from FOO.TLD (vizworld.acl.foo.tld [123.452.678.9]) by +\throthgar.la.mastaler.com (tmda-ofmipd) with ESMTP; +\tWed, 05 Mar 2003 18:10:18 -0700 +Received-2: from FOO.TLD (vizworld.acl.foo.tld [123.452.678.9]) by +\throthgar.la.mastaler.com (tmda-ofmipd) with ESMTP; +\tWed, 05 Mar 2003 18:10:18 -0700 + +""") + + def test_string_headerinst_eq(self): + h = '<15975.17901.207240.414604@sgigritzmann1.mathematik.tu-muenchen.de> (David Bremner\'s message of "Thu, 6 Mar 2003 13:58:21 +0100")' + msg = Message() + msg['Received-1'] = Header(h, header_name='Received-1', + continuation_ws='\t') + msg['Received-2'] = h + self.assertEqual(msg.as_string(), """\ +Received-1: <15975.17901.207240.414604@sgigritzmann1.mathematik.tu-muenchen.de> +\t(David Bremner's message of "Thu, 6 Mar 2003 13:58:21 +0100") +Received-2: <15975.17901.207240.414604@sgigritzmann1.mathematik.tu-muenchen.de> +\t(David Bremner's message of "Thu, 6 Mar 2003 13:58:21 +0100") + +""") + + def test_long_unbreakable_lines_with_continuation(self): + eq = self.ndiffAssertEqual + msg = Message() + t = """\ + iVBORw0KGgoAAAANSUhEUgAAADAAAAAwBAMAAAClLOS0AAAAGFBMVEUAAAAkHiJeRUIcGBi9 + locQDQ4zJykFBAXJfWDjAAACYUlEQVR4nF2TQY/jIAyFc6lydlG5x8Nyp1Y69wj1PN2I5gzp""" + msg['Face-1'] = t + msg['Face-2'] = Header(t, header_name='Face-2') + eq(msg.as_string(), """\ +Face-1: iVBORw0KGgoAAAANSUhEUgAAADAAAAAwBAMAAAClLOS0AAAAGFBMVEUAAAAkHiJeRUIcGBi9 +\tlocQDQ4zJykFBAXJfWDjAAACYUlEQVR4nF2TQY/jIAyFc6lydlG5x8Nyp1Y69wj1PN2I5gzp +Face-2: iVBORw0KGgoAAAANSUhEUgAAADAAAAAwBAMAAAClLOS0AAAAGFBMVEUAAAAkHiJeRUIcGBi9 + locQDQ4zJykFBAXJfWDjAAACYUlEQVR4nF2TQY/jIAyFc6lydlG5x8Nyp1Y69wj1PN2I5gzp + +""") + + def test_another_long_multiline_header(self): + eq = self.ndiffAssertEqual + m = '''\ +Received: from siimage.com ([172.25.1.3]) by zima.siliconimage.com with Microsoft SMTPSVC(5.0.2195.4905); + Wed, 16 Oct 2002 07:41:11 -0700''' + msg = email.message_from_string(m) + eq(msg.as_string(), '''\ +Received: from siimage.com ([172.25.1.3]) by zima.siliconimage.com with + Microsoft SMTPSVC(5.0.2195.4905); Wed, 16 Oct 2002 07:41:11 -0700 + +''') + + def test_long_lines_with_different_header(self): + eq = self.ndiffAssertEqual + h = """\ +List-Unsubscribe: , + """ + msg = Message() + msg['List'] = h + msg['List'] = Header(h, header_name='List') + eq(msg.as_string(), """\ +List: List-Unsubscribe: , + +List: List-Unsubscribe: , + + +""") + # Test mangling of "From " lines in the body of a message @@ -738,7 +871,7 @@ Blah blah blah def test_mangled_from(self): s = StringIO() - g = Generator(s, mangle_from_=1) + g = Generator(s, mangle_from_=True) g.flatten(self.msg) self.assertEqual(s.getvalue(), """\ From: aaa@bbb.org @@ -749,7 +882,7 @@ Blah blah blah def test_dont_mangle_from(self): s = StringIO() - g = Generator(s, mangle_from_=0) + g = Generator(s, mangle_from_=False) g.flatten(self.msg) self.assertEqual(s.getvalue(), """\ From: aaa@bbb.org @@ -763,8 +896,13 @@ Blah blah blah # Test the basic MIMEAudio class class TestMIMEAudio(unittest.TestCase): def setUp(self): - # In Python, audiotest.au lives in Lib/test not Lib/test/data - fp = open(findfile('audiotest.au'), 'rb') + # Make sure we pick up the audiotest.au that lives in email/test/data. + # In Python, there's an audiotest.au living in Lib/test but that isn't + # included in some binary distros that don't include the test + # package. The trailing empty string on the .join() is significant + # since findfile() will do a dirname(). + datadir = os.path.join(os.path.dirname(landmark), 'data', '') + fp = open(findfile('audiotest.au', datadir), 'rb') try: self._audiodata = fp.read() finally: @@ -883,7 +1021,7 @@ class TestMIMEText(unittest.TestCase): is missing) def test_payload(self): - self.assertEqual(self._msg.get_payload(), 'hello there\n') + self.assertEqual(self._msg.get_payload(), 'hello there') self.failUnless(not self._msg.is_multipart()) def test_charset(self): @@ -895,7 +1033,7 @@ class TestMIMEText(unittest.TestCase): # Test a more complicated multipart/mixed type message -class TestMultipartMixed(unittest.TestCase): +class TestMultipartMixed(TestEmailBase): def setUp(self): fp = openfile('PyBanner048.gif') try: @@ -978,6 +1116,7 @@ From: bperson@dom.ain ''') def test_one_part_in_a_multipart(self): + eq = self.ndiffAssertEqual outer = MIMEBase('multipart', 'mixed') outer['Subject'] = 'A subject' outer['To'] = 'aperson@dom.ain' @@ -987,7 +1126,7 @@ From: bperson@dom.ain outer.set_boundary('BOUNDARY') msg = MIMEText('hello world') outer.attach(msg) - self.assertEqual(outer.as_string(), '''\ + eq(outer.as_string(), '''\ Content-Type: multipart/mixed; boundary="BOUNDARY" MIME-Version: 1.0 Subject: A subject @@ -1000,11 +1139,11 @@ MIME-Version: 1.0 Content-Transfer-Encoding: 7bit hello world - --BOUNDARY-- ''') def test_seq_parts_in_a_multipart(self): + eq = self.ndiffAssertEqual outer = MIMEBase('multipart', 'mixed') outer['Subject'] = 'A subject' outer['To'] = 'aperson@dom.ain' @@ -1014,7 +1153,7 @@ hello world msg = MIMEText('hello world') outer.attach(msg) outer.set_boundary('BOUNDARY') - self.assertEqual(outer.as_string(), '''\ + eq(outer.as_string(), '''\ Content-Type: multipart/mixed; boundary="BOUNDARY" MIME-Version: 1.0 Subject: A subject @@ -1027,7 +1166,6 @@ MIME-Version: 1.0 Content-Transfer-Encoding: 7bit hello world - --BOUNDARY-- ''') @@ -1048,7 +1186,7 @@ class TestNonConformant(TestEmailBase): data = fp.read() finally: fp.close() - p = Parser(strict=1) + p = Parser(strict=True) # Note, under a future non-strict parsing mode, this would parse the # message into the intended message tree. self.assertRaises(Errors.BoundaryError, p.parsestr, data) @@ -1099,6 +1237,20 @@ message 2 --BOUNDARY-- """) + def test_no_separating_blank_line(self): + eq = self.ndiffAssertEqual + msg = self._msgobj('msg_35.txt') + eq(msg.as_string(), """\ +From: aperson@dom.ain +To: bperson@dom.ain +Subject: here's something interesting + +counter to RFC 2822, there's no separating newline here +""") + # strict=True should raise an exception + self.assertRaises(Errors.HeaderParseError, + self._msgobj, 'msg_35.txt', True) + # Test RFC 2047 header encoding and decoding @@ -1133,6 +1285,31 @@ class TestRFC2047(unittest.TestCase): eq(Utils.encode(s2, charset='iso-8859-2', encoding='b'), '=?iso-8859-2?b?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=') + def test_rfc2047_multiline(self): + eq = self.assertEqual + s = """Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz + foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?=""" + dh = decode_header(s) + eq(dh, [ + ('Re:', None), + ('r\x8aksm\x9arg\x8cs', 'mac-iceland'), + ('baz foo bar', None), + ('r\x8aksm\x9arg\x8cs', 'mac-iceland')]) + eq(str(make_header(dh)), + """Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar + =?mac-iceland?q?r=8Aksm=9Arg=8Cs?=""") + + def test_whitespace_eater_unicode(self): + eq = self.assertEqual + s = '=?ISO-8859-1?Q?Andr=E9?= Pirard ' + dh = decode_header(s) + eq(dh, [('Andr\xe9', 'iso-8859-1'), ('Pirard ', None)]) + # Python 2.1's unicode() builtin doesn't call the object's + # __unicode__() method. Use the following alternative instead. + #hu = unicode(make_header(dh)).encode('latin-1') + hu = make_header(dh).__unicode__().encode('latin-1') + eq(hu, 'Andr\xe9 Pirard ') + # Test the MIMEMessage class @@ -1263,6 +1440,7 @@ Your message cannot be delivered to the following recipients: '<002001c144a6$8752e060$56104586@oxy.edu>') def test_epilogue(self): + eq = self.ndiffAssertEqual fp = openfile('msg_21.txt') try: text = fp.read() @@ -1282,7 +1460,42 @@ Your message cannot be delivered to the following recipients: sfp = StringIO() g = Generator(sfp) g.flatten(msg) - self.assertEqual(sfp.getvalue(), text) + eq(sfp.getvalue(), text) + + def test_no_nl_preamble(self): + eq = self.ndiffAssertEqual + msg = Message() + msg['From'] = 'aperson@dom.ain' + msg['To'] = 'bperson@dom.ain' + msg['Subject'] = 'Test' + msg.preamble = 'MIME message' + msg.epilogue = '' + msg1 = MIMEText('One') + msg2 = MIMEText('Two') + msg.add_header('Content-Type', 'multipart/mixed', boundary='BOUNDARY') + msg.attach(msg1) + msg.attach(msg2) + eq(msg.as_string(), """\ +From: aperson@dom.ain +To: bperson@dom.ain +Subject: Test +Content-Type: multipart/mixed; boundary="BOUNDARY" + +MIME message +--BOUNDARY +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + +One +--BOUNDARY +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + +Two +--BOUNDARY-- +""") def test_default_type(self): eq = self.assertEqual @@ -1494,6 +1707,10 @@ class TestIdempotent(TestEmailBase): msg, text = self._msgobj('msg_33.txt') self._idempotent(msg, text) + def test_text_plain_in_a_multipart_digest(self): + msg, text = self._msgobj('msg_34.txt') + self._idempotent(msg, text) + def test_content_type(self): eq = self.assertEquals unless = self.failUnless @@ -1640,12 +1857,17 @@ class TestMiscellaneous(unittest.TestCase): def test_formatdate_localtime(self): now = time.time() self.assertEqual( - Utils.parsedate(Utils.formatdate(now, localtime=1))[:6], + Utils.parsedate(Utils.formatdate(now, localtime=True))[:6], time.localtime(now)[:6]) def test_parsedate_none(self): self.assertEqual(Utils.parsedate(''), None) + def test_parsedate_compact(self): + # The FWS after the comma is optional + self.assertEqual(Utils.parsedate('Wed,3 Apr 2002 14:58:26 +0800'), + Utils.parsedate('Wed, 3 Apr 2002 14:58:26 +0800')) + def test_parseaddr_empty(self): self.assertEqual(Utils.parseaddr('<>'), ('', '')) self.assertEqual(Utils.formataddr(Utils.parseaddr('<>')), '') @@ -1663,6 +1885,23 @@ class TestMiscellaneous(unittest.TestCase): b = 'person@dom.ain' self.assertEqual(Utils.parseaddr(Utils.formataddr((a, b))), (a, b)) + def test_escape_backslashes(self): + self.assertEqual( + Utils.formataddr(('Arthur \Backslash\ Foobar', 'person@dom.ain')), + r'"Arthur \\Backslash\\ Foobar" ') + a = r'Arthur \Backslash\ Foobar' + b = 'person@dom.ain' + self.assertEqual(Utils.parseaddr(Utils.formataddr((a, b))), (a, b)) + + def test_name_with_dot(self): + x = 'John X. Doe ' + y = '"John X. Doe" ' + a, b = ('John X. Doe', 'jxd@example.com') + self.assertEqual(Utils.parseaddr(x), (a, b)) + self.assertEqual(Utils.parseaddr(y), (a, b)) + # formataddr() quotes the name if there's a dot in it + self.assertEqual(Utils.formataddr((a, b)), y) + def test_quote_dump(self): self.assertEqual( Utils.formataddr(('A Silly; Person', 'person@dom.ain')), @@ -1703,6 +1942,16 @@ class TestMiscellaneous(unittest.TestCase): [('Al Person', 'aperson@dom.ain'), ('Bud Person', 'bperson@dom.ain')]) + def test_getaddresses_nasty(self): + eq = self.assertEqual + eq(Utils.getaddresses(['foo: ;']), [('', '')]) + eq(Utils.getaddresses( + ['[]*-- =~$']), + [('', ''), ('', ''), ('', '*--')]) + eq(Utils.getaddresses( + ['foo: ;', '"Jason R. Mastaler" ']), + [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]) + def test_utils_quote_unquote(self): eq = self.assertEqual msg = Message() @@ -1839,11 +2088,8 @@ Here's the message body eq(msg.get_payload(), "Here's the message body\n") def test_crlf_separation(self): - if sys.platform == 'mac': - # Skipped in MacPython 2.2.X due to line-end problems - return eq = self.assertEqual - fp = openfile('msg_26.txt') + fp = openfile('msg_26.txt', mode='rb') try: msg = Parser().parse(fp) finally: @@ -1950,7 +2196,7 @@ eHh4eCB4eHh4IA==\r # Test the charset option eq(he('hello', charset='iso-8859-2'), '=?iso-8859-2?b?aGVsbG8=?=') # Test the keep_eols flag - eq(he('hello\nworld', keep_eols=1), + eq(he('hello\nworld', keep_eols=True), '=?iso-8859-1?b?aGVsbG8Kd29ybGQ=?=') # Test the maxlinelen argument eq(he('xxxx ' * 20, maxlinelen=40), """\ @@ -2029,7 +2275,7 @@ class TestQuopri(unittest.TestCase): # Test the charset option eq(he('hello', charset='iso-8859-2'), '=?iso-8859-2?q?hello?=') # Test the keep_eols flag - eq(he('hello\nworld', keep_eols=1), '=?iso-8859-1?q?hello=0Aworld?=') + eq(he('hello\nworld', keep_eols=True), '=?iso-8859-1?q?hello=0Aworld?=') # Test a non-ASCII character eq(he('hello\xc7there'), '=?iso-8859-1?q?hello=C7there?=') # Test the maxlinelen argument @@ -2083,6 +2329,13 @@ two line""") # Test the Charset class class TestCharset(unittest.TestCase): + def tearDown(self): + from email import Charset as CharsetModule + try: + del CharsetModule.CHARSETS['fake'] + except KeyError: + pass + def test_idempotent(self): eq = self.assertEqual # Make sure us-ascii = no Unicode conversion @@ -2095,6 +2348,36 @@ class TestCharset(unittest.TestCase): sp = c.to_splittable(s) eq(s, c.from_splittable(sp)) + def test_body_encode(self): + eq = self.assertEqual + # Try a charset with QP body encoding + c = Charset('iso-8859-1') + eq('hello w=F6rld', c.body_encode('hello w\xf6rld')) + # Try a charset with Base64 body encoding + c = Charset('utf-8') + eq('aGVsbG8gd29ybGQ=\n', c.body_encode('hello world')) + # Try a charset with None body encoding + c = Charset('us-ascii') + eq('hello world', c.body_encode('hello world')) + # Try the convert argument, where input codec <> output codec + c = Charset('euc-jp') + # With apologies to Tokio Kikuchi ;) + try: + eq('\x1b$B5FCO;~IW\x1b(B', + c.body_encode('\xb5\xc6\xc3\xcf\xbb\xfe\xc9\xd7')) + eq('\xb5\xc6\xc3\xcf\xbb\xfe\xc9\xd7', + c.body_encode('\xb5\xc6\xc3\xcf\xbb\xfe\xc9\xd7', False)) + except LookupError: + # We probably don't have the Japanese codecs installed + pass + # Testing SF bug #625509, which we have to fake, since there are no + # built-in encodings where the header encoding is QP but the body + # encoding is not. + from email import Charset as CharsetModule + CharsetModule.add_charset('fake', CharsetModule.QP, None) + c = Charset('fake') + eq('hello w\xf6rld', c.body_encode('hello w\xf6rld')) + # Test multilingual MIME headers. @@ -2104,14 +2387,14 @@ class TestHeader(TestEmailBase): h = Header('Hello World!') eq(h.encode(), 'Hello World!') h.append(' Goodbye World!') - eq(h.encode(), 'Hello World! Goodbye World!') + eq(h.encode(), 'Hello World! Goodbye World!') def test_simple_surprise(self): eq = self.ndiffAssertEqual h = Header('Hello World!') eq(h.encode(), 'Hello World!') h.append('Goodbye World!') - eq(h.encode(), 'Hello World!Goodbye World!') + eq(h.encode(), 'Hello World! Goodbye World!') def test_header_needs_no_decoding(self): h = 'no decoding needed' @@ -2120,7 +2403,7 @@ class TestHeader(TestEmailBase): def test_long(self): h = Header("I am the very model of a modern Major-General; I've information vegetable, animal, and mineral; I know the kings of England, and I quote the fights historical from Marathon to Waterloo, in order categorical; I'm very well acquainted, too, with matters mathematical; I understand equations, both the simple and quadratical; about binomial theorem I'm teeming with a lot o' news, with many cheerful facts about the square of the hypotenuse.", maxlinelen=76) - for l in h.encode().split('\n '): + for l in h.encode(splitchars=' ').split('\n '): self.failUnless(len(l) <= 76) def test_multilingual(self): @@ -2135,21 +2418,18 @@ class TestHeader(TestEmailBase): h.append(cz_head, cz) h.append(utf8_head, utf8) enc = h.encode() - eq(enc, """=?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_eine?= - =?iso-8859-1?q?m_Foerderband_komfortabel_den_Korridor_ent?= - =?iso-8859-1?q?lang=2C_an_s=FCdl=FCndischen_Wandgem=E4lden_vorbei?= - =?iso-8859-1?q?=2C_gegen_die_rotierenden_Klingen_bef=F6rdert=2E_?= - =?iso-8859-2?q?Finan=E8ni_metropole_se_hroutil?= - =?iso-8859-2?q?y_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= - =?utf-8?b?5q2j56K644Gr6KiA44GG44Go57+76Kiz44Gv?= - =?utf-8?b?44GV44KM44Gm44GE44G+44Gb44KT44CC5LiA?= - =?utf-8?b?6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM?= - =?utf-8?b?44CB44GC44Go44Gv44Gn44Gf44KJ44KB44Gn?= - =?utf-8?b?44GZ44CC5a6f6Zqb44Gr44Gv44CMV2VubiBpc3QgZGE=?= - =?utf-8?q?s_Nunstuck_git_und?= - =?utf-8?q?_Slotermeyer=3F_Ja!_Beiherhund_das_Ode?= - =?utf-8?q?r_die_Flipperwaldt?= - =?utf-8?b?IGdlcnNwdXQu44CN44Go6KiA44Gj44Gm44GE44G+44GZ44CC?=""") + eq(enc, """\ +=?iso-8859-1?q?Die_Mieter_treten_hier_ein_werden_mit_einem_Foerderband_ko?= + =?iso-8859-1?q?mfortabel_den_Korridor_entlang=2C_an_s=FCdl=FCndischen_Wan?= + =?iso-8859-1?q?dgem=E4lden_vorbei=2C_gegen_die_rotierenden_Klingen_bef=F6?= + =?iso-8859-1?q?rdert=2E_?= =?iso-8859-2?q?Finan=E8ni_metropole_se_hroutily?= + =?iso-8859-2?q?_pod_tlakem_jejich_d=F9vtipu=2E=2E_?= =?utf-8?b?5q2j56K6?= + =?utf-8?b?44Gr6KiA44GG44Go57+76Kiz44Gv44GV44KM44Gm44GE44G+44Gb44KT44CC?= + =?utf-8?b?5LiA6YOo44Gv44OJ44Kk44OE6Kqe44Gn44GZ44GM44CB44GC44Go44Gv44Gn?= + =?utf-8?b?44Gf44KJ44KB44Gn44GZ44CC5a6f6Zqb44Gr44Gv44CMV2VubiBpc3QgZGFz?= + =?utf-8?q?_Nunstuck_git_und_Slotermeyer=3F_Ja!_Beiherhund_das_Oder_die_Fl?= + =?utf-8?b?aXBwZXJ3YWxkdCBnZXJzcHV0LuOAjeOBqOiogOOBo+OBpuOBhOOBvuOBmQ==?= + =?utf-8?b?44CC?=""") eq(decode_header(enc), [(g_head, "iso-8859-1"), (cz_head, "iso-8859-2"), (utf8_head, "utf-8")]) @@ -2230,6 +2510,41 @@ A very long line that must get split to something other than at the h = Header(u'\u83ca\u5730\u6642\u592b', 'utf-8') eq(h.encode(), '=?utf-8?b?6I+K5Zyw5pmC5aSr?=') + def test_bad_8bit_header(self): + raises = self.assertRaises + eq = self.assertEqual + x = 'Ynwp4dUEbay Auction Semiar- No Charge \x96 Earn Big' + raises(UnicodeError, Header, x) + h = Header() + raises(UnicodeError, h.append, x) + eq(str(Header(x, errors='replace')), x) + h.append(x, errors='replace') + eq(str(h), x) + + def test_encoded_adjacent_nonencoded(self): + eq = self.assertEqual + h = Header() + h.append('hello', 'iso-8859-1') + h.append('world') + s = h.encode() + eq(s, '=?iso-8859-1?q?hello?= world') + h = make_header(decode_header(s)) + eq(h.encode(), s) + + def test_whitespace_eater(self): + eq = self.assertEqual + s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.' + parts = decode_header(s) + eq(parts, [('Subject:', None), ('\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), ('zz.', None)]) + hdr = make_header(parts) + eq(hdr.encode(), + 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.') + + def test_broken_base64_header(self): + raises = self.assertRaises + s = 'Subject: =?EUC-KR?B?CSixpLDtKSC/7Liuvsax4iC6uLmwMcijIKHaILzSwd/H0SC8+LCjwLsgv7W/+Mj3IQ?=' + raises(Errors.HeaderParseError, decode_header, s) + # Test RFC 2231 header parameters (en/de)coding @@ -2239,7 +2554,7 @@ class TestRFC2231(TestEmailBase): msg = self._msgobj('msg_29.txt') eq(msg.get_param('title'), ('us-ascii', 'en', 'This is even more ***fun*** isn\'t it!')) - eq(msg.get_param('title', unquote=0), + eq(msg.get_param('title', unquote=False), ('us-ascii', 'en', '"This is even more ***fun*** isn\'t it!"')) def test_set_param(self): @@ -2314,6 +2629,17 @@ Do you like this message? msg = self._msgobj('msg_32.txt') eq(msg.get_content_charset(), 'us-ascii') + def test_rfc2231_no_language_or_charset(self): + m = '''\ +Content-Transfer-Encoding: 8bit +Content-Disposition: inline; filename="file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm" +Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEM; NAME*1=P_nsmail.htm + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_param('NAME'), + (None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')) + def _testclasses(): diff --git a/Lib/email/test/test_email_codecs.py b/Lib/email/test/test_email_codecs.py index a9a500e..cd8486a 100644 --- a/Lib/email/test/test_email_codecs.py +++ b/Lib/email/test/test_email_codecs.py @@ -28,7 +28,14 @@ class TestEmailAsianCodecs(TestEmailBase): ghello = 'Gr\xfc\xdf Gott!' h.append(jhello, j) h.append(ghello, g) - eq(h.encode(), 'Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=\n =?iso-8859-1?q?Gr=FC=DF_Gott!?=') + # BAW: This used to -- and maybe should -- fold the two iso-8859-1 + # chunks into a single encoded word. However it doesn't violate the + # standard to have them as two encoded chunks and maybe it's + # reasonable for each .append() call to result in a separate + # encoded word. + eq(h.encode(), """\ +Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?= + =?iso-8859-1?q?Gr=FC=DF?= =?iso-8859-1?q?_Gott!?=""") eq(decode_header(h.encode()), [('Hello World!', None), ('\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'), @@ -37,23 +44,12 @@ class TestEmailAsianCodecs(TestEmailBase): h = Header(long, j, header_name="Subject") # test a very long header enc = h.encode() - # BAW: The following used to pass. Sadly, the test afterwards is what - # happens now. I've no idea which is right. Please, any Japanese and - # RFC 2047 experts, please verify! -## eq(enc, '''\ -##=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYRsoQg==?= -## =?iso-2022-jp?b?GyRCITwlayRPO0oycTxUJE4+NRsoQg==?= -## =?iso-2022-jp?b?GyRCRyckckJUJEMkRiQkJF4kORsoQg==?=''') - eq(enc, """\ -=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYRsoQg==?= - =?iso-2022-jp?b?GyRCITwlayRPO0oycTxUJE4+NUcnJHJCVCRDJEYkJCReJDkbKEI=?=""") - # BAW: same deal here. :( -## self.assertEqual( -## decode_header(enc), -## [("test-ja \x1b$B$XEj9F$5$l$?%a\x1b(B\x1b$B!<%k$O;J2q5\x1b(B\x1b$BG'$rBT$C$F$$$^$9\x1b(B", 'iso-2022-jp')]) - self.assertEqual( - decode_header(enc), - [("test-ja \x1b$B$XEj9F$5$l$?%a\x1b(B\x1b$B!<%k$O;J2q5G'$rBT$C$F$$$^$9\x1b(B", 'iso-2022-jp')]) + # TK: splitting point may differ by codec design and/or Header encoding + eq(enc , """\ +=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYSE8JWskTztKGyhC?= + =?iso-2022-jp?b?GyRCMnE8VCROPjVHJyRyQlQkQyRGJCQkXiQ5GyhC?=""") + # TK: full decode comparison + eq(h.__unicode__().encode('euc-jp'), long) -- cgit v0.12