diff options
Diffstat (limited to 'Lib/email')
| -rw-r--r-- | Lib/email/_encoded_words.py | 2 | ||||
| -rw-r--r-- | Lib/email/_header_value_parser.py | 79 | ||||
| -rw-r--r-- | Lib/email/contentmanager.py | 249 | ||||
| -rw-r--r-- | Lib/email/encoders.py | 17 | ||||
| -rw-r--r-- | Lib/email/feedparser.py | 85 | ||||
| -rw-r--r-- | Lib/email/generator.py | 13 | ||||
| -rw-r--r-- | Lib/email/header.py | 2 | ||||
| -rw-r--r-- | Lib/email/headerregistry.py | 3 | ||||
| -rw-r--r-- | Lib/email/iterators.py | 6 | ||||
| -rw-r--r-- | Lib/email/message.py | 283 | ||||
| -rw-r--r-- | Lib/email/mime/nonmultipart.py | 2 | ||||
| -rw-r--r-- | Lib/email/mime/text.py | 1 | ||||
| -rw-r--r-- | Lib/email/parser.py | 11 | ||||
| -rw-r--r-- | Lib/email/policy.py | 13 | ||||
| -rw-r--r-- | Lib/email/quoprimime.py | 1 | ||||
| -rw-r--r-- | Lib/email/utils.py | 69 |
16 files changed, 679 insertions, 157 deletions
diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py index 9e0cc75..5eaab36 100644 --- a/Lib/email/_encoded_words.py +++ b/Lib/email/_encoded_words.py @@ -152,7 +152,7 @@ def decode(ew): then from the resulting bytes into unicode using the specified charset. If the cte-decoded string does not successfully decode using the specified character set, a defect is added to the defects list and the unknown octets - are replaced by the unicode 'unknown' character \uFDFF. + are replaced by the unicode 'unknown' character \\uFDFF. The specified charset and language are returned. The default for language, which is rarely if ever encountered, is the empty string. diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 0369e01..a9bdf44 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -70,7 +70,8 @@ XXX: provide complete list of token types. import re import urllib # For urllib.parse.unquote from string import hexdigits -from collections import namedtuple, OrderedDict +from collections import OrderedDict +from operator import itemgetter from email import _encoded_words as _ew from email import errors from email import utils @@ -368,8 +369,7 @@ class TokenList(list): yield (indent + ' !! invalid element in token ' 'list: {!r}'.format(token)) else: - for line in token._pp(indent+' '): - yield line + yield from token._pp(indent+' ') if self.defects: extra = ' Defects: {}'.format(self.defects) else: @@ -1099,15 +1099,34 @@ class MimeParameters(TokenList): params[name] = [] params[name].append((token.section_number, token)) for name, parts in params.items(): - parts = sorted(parts) - # XXX: there might be more recovery we could do here if, for - # example, this is really a case of a duplicate attribute name. + parts = sorted(parts, key=itemgetter(0)) + first_param = parts[0][1] + charset = first_param.charset + # Our arbitrary error recovery is to ignore duplicate parameters, + # to use appearance order if there are duplicate rfc 2231 parts, + # and to ignore gaps. This mimics the error recovery of get_param. + if not first_param.extended and len(parts) > 1: + if parts[1][0] == 0: + parts[1][1].defects.append(errors.InvalidHeaderDefect( + 'duplicate parameter name; duplicate(s) ignored')) + parts = parts[:1] + # Else assume the *0* was missing...note that this is different + # from get_param, but we registered a defect for this earlier. value_parts = [] - charset = parts[0][1].charset - for i, (section_number, param) in enumerate(parts): + i = 0 + for section_number, param in parts: if section_number != i: - param.defects.append(errors.InvalidHeaderDefect( - "inconsistent multipart parameter numbering")) + # We could get fancier here and look for a complete + # duplicate extended parameter and ignore the second one + # seen. But we're not doing that. The old code didn't. + if not param.extended: + param.defects.append(errors.InvalidHeaderDefect( + 'duplicate parameter name; duplicate ignored')) + continue + else: + param.defects.append(errors.InvalidHeaderDefect( + "inconsistent RFC2231 parameter numbering")) + i += 1 value = param.param_value if param.extended: try: @@ -1315,24 +1334,22 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker') # Parser # -"""Parse strings according to RFC822/2047/2822/5322 rules. - -This is a stateless parser. Each get_XXX function accepts a string and -returns either a Terminal or a TokenList representing the RFC object named -by the method and a string containing the remaining unparsed characters -from the input. Thus a parser method consumes the next syntactic construct -of a given type and returns a token representing the construct plus the -unparsed remainder of the input string. - -For example, if the first element of a structured header is a 'phrase', -then: - - phrase, value = get_phrase(value) - -returns the complete phrase from the start of the string value, plus any -characters left in the string after the phrase is removed. - -""" +# Parse strings according to RFC822/2047/2822/5322 rules. +# +# This is a stateless parser. Each get_XXX function accepts a string and +# returns either a Terminal or a TokenList representing the RFC object named +# by the method and a string containing the remaining unparsed characters +# from the input. Thus a parser method consumes the next syntactic construct +# of a given type and returns a token representing the construct plus the +# unparsed remainder of the input string. +# +# For example, if the first element of a structured header is a 'phrase', +# then: +# +# phrase, value = get_phrase(value) +# +# returns the complete phrase from the start of the string value, plus any +# characters left in the string after the phrase is removed. _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( @@ -2900,7 +2917,7 @@ def parse_content_disposition_header(value): try: token, value = get_token(value) except errors.HeaderParseError: - ctype.defects.append(errors.InvalidHeaderDefect( + disp_header.defects.append(errors.InvalidHeaderDefect( "Expected content disposition but found {!r}".format(value))) _find_mime_parameters(disp_header, value) return disp_header @@ -2931,8 +2948,8 @@ def parse_content_transfer_encoding_header(value): try: token, value = get_token(value) except errors.HeaderParseError: - ctype.defects.append(errors.InvalidHeaderDefect( - "Expected content trnasfer encoding but found {!r}".format(value))) + cte_header.defects.append(errors.InvalidHeaderDefect( + "Expected content transfer encoding but found {!r}".format(value))) else: cte_header.append(token) cte_header.cte = token.value.strip().lower() diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py new file mode 100644 index 0000000..d363652 --- /dev/null +++ b/Lib/email/contentmanager.py @@ -0,0 +1,249 @@ +import binascii +import email.charset +import email.message +import email.errors +from email import quoprimime + +class ContentManager: + + def __init__(self): + self.get_handlers = {} + self.set_handlers = {} + + def add_get_handler(self, key, handler): + self.get_handlers[key] = handler + + def get_content(self, msg, *args, **kw): + content_type = msg.get_content_type() + if content_type in self.get_handlers: + return self.get_handlers[content_type](msg, *args, **kw) + maintype = msg.get_content_maintype() + if maintype in self.get_handlers: + return self.get_handlers[maintype](msg, *args, **kw) + if '' in self.get_handlers: + return self.get_handlers[''](msg, *args, **kw) + raise KeyError(content_type) + + def add_set_handler(self, typekey, handler): + self.set_handlers[typekey] = handler + + def set_content(self, msg, obj, *args, **kw): + if msg.get_content_maintype() == 'multipart': + # XXX: is this error a good idea or not? We can remove it later, + # but we can't add it later, so do it for now. + raise TypeError("set_content not valid on multipart") + handler = self._find_set_handler(msg, obj) + msg.clear_content() + handler(msg, obj, *args, **kw) + + def _find_set_handler(self, msg, obj): + full_path_for_error = None + for typ in type(obj).__mro__: + if typ in self.set_handlers: + return self.set_handlers[typ] + qname = typ.__qualname__ + modname = getattr(typ, '__module__', '') + full_path = '.'.join((modname, qname)) if modname else qname + if full_path_for_error is None: + full_path_for_error = full_path + if full_path in self.set_handlers: + return self.set_handlers[full_path] + if qname in self.set_handlers: + return self.set_handlers[qname] + name = typ.__name__ + if name in self.set_handlers: + return self.set_handlers[name] + if None in self.set_handlers: + return self.set_handlers[None] + raise KeyError(full_path_for_error) + + +raw_data_manager = ContentManager() + + +def get_text_content(msg, errors='replace'): + content = msg.get_payload(decode=True) + charset = msg.get_param('charset', 'ASCII') + return content.decode(charset, errors=errors) +raw_data_manager.add_get_handler('text', get_text_content) + + +def get_non_text_content(msg): + return msg.get_payload(decode=True) +for maintype in 'audio image video application'.split(): + raw_data_manager.add_get_handler(maintype, get_non_text_content) + + +def get_message_content(msg): + return msg.get_payload(0) +for subtype in 'rfc822 external-body'.split(): + raw_data_manager.add_get_handler('message/'+subtype, get_message_content) + + +def get_and_fixup_unknown_message_content(msg): + # If we don't understand a message subtype, we are supposed to treat it as + # if it were application/octet-stream, per + # tools.ietf.org/html/rfc2046#section-5.2.4. Feedparser doesn't do that, + # so do our best to fix things up. Note that it is *not* appropriate to + # model message/partial content as Message objects, so they are handled + # here as well. (How to reassemble them is out of scope for this comment :) + return bytes(msg.get_payload(0)) +raw_data_manager.add_get_handler('message', + get_and_fixup_unknown_message_content) + + +def _prepare_set(msg, maintype, subtype, headers): + msg['Content-Type'] = '/'.join((maintype, subtype)) + if headers: + if not hasattr(headers[0], 'name'): + mp = msg.policy + headers = [mp.header_factory(*mp.header_source_parse([header])) + for header in headers] + try: + for header in headers: + if header.defects: + raise header.defects[0] + msg[header.name] = header + except email.errors.HeaderDefect as exc: + raise ValueError("Invalid header: {}".format( + header.fold(policy=msg.policy))) from exc + + +def _finalize_set(msg, disposition, filename, cid, params): + if disposition is None and filename is not None: + disposition = 'attachment' + if disposition is not None: + msg['Content-Disposition'] = disposition + if filename is not None: + msg.set_param('filename', + filename, + header='Content-Disposition', + replace=True) + if cid is not None: + msg['Content-ID'] = cid + if params is not None: + for key, value in params.items(): + msg.set_param(key, value) + + +# XXX: This is a cleaned-up version of base64mime.body_encode. It would +# be nice to drop both this and quoprimime.body_encode in favor of +# enhanced binascii routines that accepted a max_line_length parameter. +def _encode_base64(data, max_line_length): + encoded_lines = [] + unencoded_bytes_per_line = max_line_length * 3 // 4 + for i in range(0, len(data), unencoded_bytes_per_line): + thisline = data[i:i+unencoded_bytes_per_line] + encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii')) + return ''.join(encoded_lines) + + +def _encode_text(string, charset, cte, policy): + lines = string.encode(charset).splitlines() + linesep = policy.linesep.encode('ascii') + def embeded_body(lines): return linesep.join(lines) + linesep + def normal_body(lines): return b'\n'.join(lines) + b'\n' + if cte==None: + # Use heuristics to decide on the "best" encoding. + try: + return '7bit', normal_body(lines).decode('ascii') + except UnicodeDecodeError: + pass + if (policy.cte_type == '8bit' and + max(len(x) for x in lines) <= policy.max_line_length): + return '8bit', normal_body(lines).decode('ascii', 'surrogateescape') + sniff = embeded_body(lines[:10]) + sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'), + policy.max_line_length) + sniff_base64 = binascii.b2a_base64(sniff) + # This is a little unfair to qp; it includes lineseps, base64 doesn't. + if len(sniff_qp) > len(sniff_base64): + cte = 'base64' + else: + cte = 'quoted-printable' + if len(lines) <= 10: + return cte, sniff_qp + if cte == '7bit': + data = normal_body(lines).decode('ascii') + elif cte == '8bit': + data = normal_body(lines).decode('ascii', 'surrogateescape') + elif cte == 'quoted-printable': + data = quoprimime.body_encode(normal_body(lines).decode('latin-1'), + policy.max_line_length) + elif cte == 'base64': + data = _encode_base64(embeded_body(lines), policy.max_line_length) + else: + raise ValueError("Unknown content transfer encoding {}".format(cte)) + return cte, data + + +def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, 'text', subtype, headers) + cte, payload = _encode_text(string, charset, cte, msg.policy) + msg.set_payload(payload) + msg.set_param('charset', + email.charset.ALIASES.get(charset, charset), + replace=True) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(str, set_text_content) + + +def set_message_content(msg, message, subtype="rfc822", cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + if subtype == 'partial': + raise ValueError("message/partial is not supported for Message objects") + if subtype == 'rfc822': + if cte not in (None, '7bit', '8bit', 'binary'): + # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate. + raise ValueError( + "message/rfc822 parts do not support cte={}".format(cte)) + # 8bit will get coerced on serialization if policy.cte_type='7bit'. We + # may end up claiming 8bit when it isn't needed, but the only negative + # result of that should be a gateway that needs to coerce to 7bit + # having to look through the whole embedded message to discover whether + # or not it actually has to do anything. + cte = '8bit' if cte is None else cte + elif subtype == 'external-body': + if cte not in (None, '7bit'): + # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate. + raise ValueError( + "message/external-body parts do not support cte={}".format(cte)) + cte = '7bit' + elif cte is None: + # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future + # subtypes should be restricted to 7bit, so assume that. + cte = '7bit' + _prepare_set(msg, 'message', subtype, headers) + msg.set_payload([message]) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(email.message.Message, set_message_content) + + +def set_bytes_content(msg, data, maintype, subtype, cte='base64', + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, maintype, subtype, headers) + if cte == 'base64': + data = _encode_base64(data, max_line_length=msg.policy.max_line_length) + elif cte == 'quoted-printable': + # XXX: quoprimime.body_encode won't encode newline characters in data, + # so we can't use it. This means max_line_length is ignored. Another + # bug to fix later. (Note: encoders.quopri is broken on line ends.) + data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True) + data = data.decode('ascii') + elif cte == '7bit': + # Make sure it really is only ASCII. The early warning here seems + # worth the overhead...if you care write your own content manager :). + data.encode('ascii') + elif cte in ('8bit', 'binary'): + data = data.decode('ascii', 'surrogateescape') + msg.set_payload(data) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +for typ in (bytes, bytearray, memoryview): + raw_data_manager.add_set_handler(typ, set_bytes_content) diff --git a/Lib/email/encoders.py b/Lib/email/encoders.py index f9657f0..0a66acb 100644 --- a/Lib/email/encoders.py +++ b/Lib/email/encoders.py @@ -54,21 +54,12 @@ def encode_7or8bit(msg): # There's no payload. For backwards compatibility we use 7bit msg['Content-Transfer-Encoding'] = '7bit' return - # We play a trick to make this go fast. If encoding/decode to ASCII - # succeeds, we know the data must be 7bit, otherwise treat it as 8bit. + # We play a trick to make this go fast. If decoding from ASCII succeeds, + # we know the data must be 7bit, otherwise treat it as 8bit. try: - if isinstance(orig, str): - orig.encode('ascii') - else: - orig.decode('ascii') + orig.decode('ascii') except UnicodeError: - charset = msg.get_charset() - output_cset = charset and charset.output_charset - # iso-2022-* is non-ASCII but encodes to a 7-bit representation - if output_cset and output_cset.lower().startswith('iso-2022-'): - msg['Content-Transfer-Encoding'] = '7bit' - else: - msg['Content-Transfer-Encoding'] = '8bit' + msg['Content-Transfer-Encoding'] = '8bit' else: msg['Content-Transfer-Encoding'] = '7bit' diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index ea41e95..c95b27f 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -33,7 +33,7 @@ NLCRE_eol = re.compile('(\r\n|\r|\n)\Z') NLCRE_crack = re.compile('(\r\n|\r|\n)') # RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character # except controls, SP, and ":". -headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])') +headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])') EMPTYSTRING = '' NL = '\n' @@ -50,8 +50,8 @@ class BufferedSubFile(object): simple abstraction -- it parses until EOF closes the current message. """ def __init__(self): - # The last partial line pushed into this object. - self._partial = '' + # Chunks of the last partial line pushed into this object. + self._partial = [] # The list of full, pushed lines, in reverse order self._lines = [] # The stack of false-EOF checking predicates. @@ -67,8 +67,8 @@ class BufferedSubFile(object): def close(self): # Don't forget any trailing partial line. - self._lines.append(self._partial) - self._partial = '' + self.pushlines(''.join(self._partial).splitlines(True)) + self._partial = [] self._closed = True def readline(self): @@ -96,26 +96,27 @@ class BufferedSubFile(object): def push(self, data): """Push some new data into this object.""" - # Handle any previous leftovers - data, self._partial = self._partial + data, '' - # Crack into lines, but preserve the newlines on the end of each - parts = NLCRE_crack.split(data) - # The *ahem* interesting behaviour of re.split when supplied grouping - # parentheses is that the last element of the resulting list is the - # data after the final RE. In the case of a NL/CR terminated string, - # this is the empty string. - self._partial = parts.pop() - #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r: - # is there a \n to follow later? - if not self._partial and parts and parts[-1].endswith('\r'): - self._partial = parts.pop(-2)+parts.pop() - # parts is a list of strings, alternating between the line contents - # and the eol character(s). Gather up a list of lines after - # re-attaching the newlines. - lines = [] - for i in range(len(parts) // 2): - lines.append(parts[i*2] + parts[i*2+1]) - self.pushlines(lines) + # Crack into lines, but preserve the linesep characters on the end of each + parts = data.splitlines(True) + + if not parts or not parts[0].endswith(('\n', '\r')): + # No new complete lines, so just accumulate partials + self._partial += parts + return + + if self._partial: + # If there are previous leftovers, complete them now + self._partial.append(parts[0]) + parts[0:1] = ''.join(self._partial).splitlines(True) + del self._partial[:] + + # If the last element of the list does not end in a newline, then treat + # it as a partial line. We only check for '\n' here because a line + # ending with '\r' might be a line that was split in the middle of a + # '\r\n' sequence (see bugs 1555570 and 1721862). + if not parts[-1].endswith('\n'): + self._partial = [parts.pop()] + self.pushlines(parts) def pushlines(self, lines): # Reverse and insert at the front of the lines. @@ -135,7 +136,7 @@ class BufferedSubFile(object): class FeedParser: """A feed-style parser of email.""" - def __init__(self, _factory=message.Message, *, policy=compat32): + def __init__(self, _factory=None, *, policy=compat32): """_factory is called with no arguments to create a new message obj The policy keyword specifies a policy object that controls a number of @@ -143,14 +144,23 @@ class FeedParser: backward compatibility. """ - self._factory = _factory self.policy = policy - try: - _factory(policy=self.policy) - self._factory_kwds = lambda: {'policy': self.policy} - except TypeError: - # Assume this is an old-style factory - self._factory_kwds = lambda: {} + self._factory_kwds = lambda: {'policy': self.policy} + if _factory is None: + # What this should be: + #self._factory = policy.default_message_factory + # but, because we are post 3.4 feature freeze, fix with temp hack: + if self.policy is compat32: + self._factory = message.Message + else: + self._factory = message.EmailMessage + else: + self._factory = _factory + try: + _factory(policy=self.policy) + except TypeError: + # Assume this is an old-style factory + self._factory_kwds = lambda: {} self._input = BufferedSubFile() self._msgstack = [] self._parse = self._parsegen().__next__ @@ -501,6 +511,15 @@ class FeedParser: # There will always be a colon, because if there wasn't the part of # the parser that calls us would have started parsing the body. i = line.find(':') + + # If the colon is on the start of the line the header is clearly + # malformed, but we might be able to salvage the rest of the + # message. Track the error but keep going. + if i == 0: + defect = errors.InvalidHeaderDefect("Missing header name.") + self._cur.defects.append(defect) + continue + assert i>0, "_parse_headers fed line with no : and no leading WS" lastheader = line[:i] lastvalue = [line] diff --git a/Lib/email/generator.py b/Lib/email/generator.py index e4a86d4..4735721 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -10,14 +10,10 @@ import re import sys import time import random -import warnings from copy import deepcopy from io import StringIO, BytesIO -from email._policybase import compat32 -from email.header import Header from email.utils import _has_surrogates -import email.charset as _charset UNDERSCORE = '_' NL = '\n' # XXX: no longer used by the code below. @@ -55,8 +51,9 @@ class Generator: by RFC 2822. The policy keyword specifies a policy object that controls a number of - aspects of the generator's operation. The default policy maintains - backward compatibility. + aspects of the generator's operation. If no policy is specified, + the policy associated with the Message object passed to the + flatten method is used. """ self._fp = outfp @@ -80,7 +77,9 @@ class Generator: Note that for subobjects, no From_ line is printed. linesep specifies the characters used to indicate a new line in - the output. The default value is determined by the policy. + the output. The default value is determined by the policy specified + when the Generator instance was created or, if none was specified, + from the policy associated with the msg. """ # We use the _XXX constants for operating on data that comes directly diff --git a/Lib/email/header.py b/Lib/email/header.py index 5bd0638..9c89589 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -100,7 +100,6 @@ def decode_header(header): words.append((encoded, encoding, charset)) # Now loop over words and remove words that consist of whitespace # between two encoded strings. - import sys droplist = [] for n, w in enumerate(words): if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace(): @@ -362,7 +361,6 @@ class Header: for string, charset in self._chunks: if hasspace is not None: hasspace = string and self._nonctext(string[0]) - import sys if lastcs not in (None, 'us-ascii'): if not hasspace or charset not in (None, 'us-ascii'): formatter.add_transition() diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py index 1fae950..911a2af 100644 --- a/Lib/email/headerregistry.py +++ b/Lib/email/headerregistry.py @@ -7,6 +7,7 @@ Eventually HeaderRegistry will be a public API, but it isn't yet, and will probably change some before that happens. """ +from types import MappingProxyType from email import utils from email import errors @@ -454,7 +455,7 @@ class ParameterizedMIMEHeader: @property def params(self): - return self._params.copy() + return MappingProxyType(self._params) class ContentTypeHeader(ParameterizedMIMEHeader): diff --git a/Lib/email/iterators.py b/Lib/email/iterators.py index 3adc4a0..b5502ee 100644 --- a/Lib/email/iterators.py +++ b/Lib/email/iterators.py @@ -26,8 +26,7 @@ def walk(self): yield self if self.is_multipart(): for subpart in self.get_payload(): - for subsubpart in subpart.walk(): - yield subsubpart + yield from subpart.walk() @@ -40,8 +39,7 @@ def body_line_iterator(msg, decode=False): for subpart in msg.walk(): payload = subpart.get_payload(decode=decode) if isinstance(payload, str): - for line in StringIO(payload): - yield line + yield from StringIO(payload) def typed_subpart_iterator(msg, maintype='text', subtype=None): diff --git a/Lib/email/message.py b/Lib/email/message.py index afe350c..2f37dbb 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -8,8 +8,8 @@ __all__ = ['Message'] import re import uu -import base64 -import binascii +import quopri +import warnings from io import BytesIO, StringIO # Intrapackage imports @@ -132,22 +132,50 @@ class Message: def __str__(self): """Return the entire formatted message as a string. - This includes the headers, body, and envelope header. """ return self.as_string() - def as_string(self, unixfrom=False, maxheaderlen=0): + def as_string(self, unixfrom=False, maxheaderlen=0, policy=None): """Return the entire formatted message as a string. - Optional `unixfrom' when True, means include the Unix From_ envelope - header. - This is a convenience method and may not generate the message exactly - as you intend. For more flexibility, use the flatten() method of a - Generator instance. + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. For backward compatibility reasons, if maxheaderlen is + not specified it defaults to 0, so you must override it explicitly + if you want a different maxheaderlen. 'policy' is passed to the + Generator instance used to serialize the mesasge; if it is not + specified the policy associated with the message instance is used. + + If the message object contains binary data that is not encoded + according to RFC standards, the non-compliant data will be replaced by + unicode "unknown character" code points. """ from email.generator import Generator + policy = self.policy if policy is None else policy fp = StringIO() - g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen) + g = Generator(fp, + mangle_from_=False, + maxheaderlen=maxheaderlen, + policy=policy) + g.flatten(self, unixfrom=unixfrom) + return fp.getvalue() + + def __bytes__(self): + """Return the entire formatted message as a bytes object. + """ + return self.as_bytes() + + def as_bytes(self, unixfrom=False, policy=None): + """Return the entire formatted message as a bytes object. + + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. 'policy' is passed to the BytesGenerator instance used to + serialize the message; if not specified the policy associated with + the message instance is used. + """ + from email.generator import BytesGenerator + policy = self.policy if policy is None else policy + fp = BytesIO() + g = BytesGenerator(fp, mangle_from_=False, policy=policy) g.flatten(self, unixfrom=unixfrom) return fp.getvalue() @@ -177,7 +205,11 @@ class Message: if self._payload is None: self._payload = [payload] else: - self._payload.append(payload) + try: + self._payload.append(payload) + except AttributeError: + raise TypeError("Attach is not valid on a message with a" + " non-multipart payload") def get_payload(self, i=None, decode=False): """Return a reference to the payload. @@ -241,14 +273,14 @@ class Message: bpayload = payload.encode('ascii') except UnicodeError: # This won't happen for RFC compliant messages (messages - # containing only ASCII codepoints in the unicode input). + # containing only ASCII code points in the unicode input). # If it does happen, turn the string into bytes in a way # guaranteed not to fail. bpayload = payload.encode('raw-unicode-escape') if not decode: return payload if cte == 'quoted-printable': - return utils._qdecode(bpayload) + return quopri.decodestring(bpayload) elif cte == 'base64': # XXX: this is a bit of a hack; decode_b should probably be factored # out somewhere, but I haven't figured out where yet. @@ -668,7 +700,7 @@ class Message: return failobj def set_param(self, param, value, header='Content-Type', requote=True, - charset=None, language=''): + charset=None, language='', replace=False): """Set a parameter in the Content-Type header. If the parameter already exists in the header, its value will be @@ -712,8 +744,11 @@ class Message: else: ctype = SEMISPACE.join([ctype, append_param]) if ctype != self.get(header): - del self[header] - self[header] = ctype + if replace: + self.replace_header(header, ctype) + else: + del self[header] + self[header] = ctype def del_param(self, param, header='content-type', requote=True): """Remove the given parameter completely from the Content-Type header. @@ -894,3 +929,219 @@ class Message: # I.e. def walk(self): ... from email.iterators import walk + +# XXX Support for temporary deprecation hack for is_attachment property. +class _IsAttachment: + def __init__(self, value): + self.value = value + def __call__(self): + return self.value + def __bool__(self): + warnings.warn("is_attachment will be a method, not a property, in 3.5", + DeprecationWarning, + stacklevel=3) + return self.value + +class MIMEPart(Message): + + def __init__(self, policy=None): + if policy is None: + from email.policy import default + policy = default + Message.__init__(self, policy) + + @property + def is_attachment(self): + c_d = self.get('content-disposition') + result = False if c_d is None else c_d.content_disposition == 'attachment' + # XXX transitional hack to raise deprecation if not called. + return _IsAttachment(result) + + def _find_body(self, part, preferencelist): + if part.is_attachment(): + return + maintype, subtype = part.get_content_type().split('/') + if maintype == 'text': + if subtype in preferencelist: + yield (preferencelist.index(subtype), part) + return + if maintype != 'multipart': + return + if subtype != 'related': + for subpart in part.iter_parts(): + yield from self._find_body(subpart, preferencelist) + return + if 'related' in preferencelist: + yield (preferencelist.index('related'), part) + candidate = None + start = part.get_param('start') + if start: + for subpart in part.iter_parts(): + if subpart['content-id'] == start: + candidate = subpart + break + if candidate is None: + subparts = part.get_payload() + candidate = subparts[0] if subparts else None + if candidate is not None: + yield from self._find_body(candidate, preferencelist) + + def get_body(self, preferencelist=('related', 'html', 'plain')): + """Return best candidate mime part for display as 'body' of message. + + Do a depth first search, starting with self, looking for the first part + matching each of the items in preferencelist, and return the part + corresponding to the first item that has a match, or None if no items + have a match. If 'related' is not included in preferencelist, consider + the root part of any multipart/related encountered as a candidate + match. Ignore parts with 'Content-Disposition: attachment'. + """ + best_prio = len(preferencelist) + body = None + for prio, part in self._find_body(self, preferencelist): + if prio < best_prio: + best_prio = prio + body = part + if prio == 0: + break + return body + + _body_types = {('text', 'plain'), + ('text', 'html'), + ('multipart', 'related'), + ('multipart', 'alternative')} + def iter_attachments(self): + """Return an iterator over the non-main parts of a multipart. + + Skip the first of each occurrence of text/plain, text/html, + multipart/related, or multipart/alternative in the multipart (unless + they have a 'Content-Disposition: attachment' header) and include all + remaining subparts in the returned iterator. When applied to a + multipart/related, return all parts except the root part. Return an + empty iterator when applied to a multipart/alternative or a + non-multipart. + """ + maintype, subtype = self.get_content_type().split('/') + if maintype != 'multipart' or subtype == 'alternative': + return + parts = self.get_payload() + if maintype == 'multipart' and subtype == 'related': + # For related, we treat everything but the root as an attachment. + # The root may be indicated by 'start'; if there's no start or we + # can't find the named start, treat the first subpart as the root. + start = self.get_param('start') + if start: + found = False + attachments = [] + for part in parts: + if part.get('content-id') == start: + found = True + else: + attachments.append(part) + if found: + yield from attachments + return + parts.pop(0) + yield from parts + return + # Otherwise we more or less invert the remaining logic in get_body. + # This only really works in edge cases (ex: non-text relateds or + # alternatives) if the sending agent sets content-disposition. + seen = [] # Only skip the first example of each candidate type. + for part in parts: + maintype, subtype = part.get_content_type().split('/') + if ((maintype, subtype) in self._body_types and + not part.is_attachment() and subtype not in seen): + seen.append(subtype) + continue + yield part + + def iter_parts(self): + """Return an iterator over all immediate subparts of a multipart. + + Return an empty iterator for a non-multipart. + """ + if self.get_content_maintype() == 'multipart': + yield from self.get_payload() + + def get_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + return content_manager.get_content(self, *args, **kw) + + def set_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + content_manager.set_content(self, *args, **kw) + + def _make_multipart(self, subtype, disallowed_subtypes, boundary): + if self.get_content_maintype() == 'multipart': + existing_subtype = self.get_content_subtype() + disallowed_subtypes = disallowed_subtypes + (subtype,) + if existing_subtype in disallowed_subtypes: + raise ValueError("Cannot convert {} to {}".format( + existing_subtype, subtype)) + keep_headers = [] + part_headers = [] + for name, value in self._headers: + if name.lower().startswith('content-'): + part_headers.append((name, value)) + else: + keep_headers.append((name, value)) + if part_headers: + # There is existing content, move it to the first subpart. + part = type(self)(policy=self.policy) + part._headers = part_headers + part._payload = self._payload + self._payload = [part] + else: + self._payload = [] + self._headers = keep_headers + self['Content-Type'] = 'multipart/' + subtype + if boundary is not None: + self.set_param('boundary', boundary) + + def make_related(self, boundary=None): + self._make_multipart('related', ('alternative', 'mixed'), boundary) + + def make_alternative(self, boundary=None): + self._make_multipart('alternative', ('mixed',), boundary) + + def make_mixed(self, boundary=None): + self._make_multipart('mixed', (), boundary) + + def _add_multipart(self, _subtype, *args, _disp=None, **kw): + if (self.get_content_maintype() != 'multipart' or + self.get_content_subtype() != _subtype): + getattr(self, 'make_' + _subtype)() + part = type(self)(policy=self.policy) + part.set_content(*args, **kw) + if _disp and 'content-disposition' not in part: + part['Content-Disposition'] = _disp + self.attach(part) + + def add_related(self, *args, **kw): + self._add_multipart('related', *args, _disp='inline', **kw) + + def add_alternative(self, *args, **kw): + self._add_multipart('alternative', *args, **kw) + + def add_attachment(self, *args, **kw): + self._add_multipart('mixed', *args, _disp='attachment', **kw) + + def clear(self): + self._headers = [] + self._payload = None + + def clear_content(self): + self._headers = [(n, v) for n, v in self._headers + if not n.lower().startswith('content-')] + self._payload = None + + +class EmailMessage(MIMEPart): + + def set_content(self, *args, **kw): + super().set_content(*args, **kw) + if 'MIME-Version' not in self: + self['MIME-Version'] = '1.0' diff --git a/Lib/email/mime/nonmultipart.py b/Lib/email/mime/nonmultipart.py index fc3b9eb..e1f5196 100644 --- a/Lib/email/mime/nonmultipart.py +++ b/Lib/email/mime/nonmultipart.py @@ -12,7 +12,7 @@ from email.mime.base import MIMEBase class MIMENonMultipart(MIMEBase): - """Base class for MIME multipart/* type messages.""" + """Base class for MIME non-multipart type messages.""" def attach(self, payload): # The public API prohibits attaching multiple subparts to MIMEBase diff --git a/Lib/email/mime/text.py b/Lib/email/mime/text.py index 3b5b09f..ec18b85 100644 --- a/Lib/email/mime/text.py +++ b/Lib/email/mime/text.py @@ -6,7 +6,6 @@ __all__ = ['MIMEText'] -from email.encoders import encode_7or8bit from email.mime.nonmultipart import MIMENonMultipart diff --git a/Lib/email/parser.py b/Lib/email/parser.py index 752bf35..8c9bc9e 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -4,19 +4,18 @@ """A parser of RFC 2822 and MIME email messages.""" -__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser'] +__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser', + 'FeedParser', 'BytesFeedParser'] -import warnings from io import StringIO, TextIOWrapper from email.feedparser import FeedParser, BytesFeedParser -from email.message import Message from email._policybase import compat32 class Parser: - def __init__(self, _class=Message, *, policy=compat32): + def __init__(self, _class=None, *, policy=compat32): """Parser of RFC 2822 and MIME email messages. Creates an in-memory object tree representing the email message, which @@ -107,8 +106,10 @@ class BytesParser: meaning it parses the entire contents of the file. """ fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape') - with fp: + try: return self.parser.parse(fp, headersonly) + finally: + fp.detach() def parsebytes(self, text, headersonly=False): diff --git a/Lib/email/policy.py b/Lib/email/policy.py index 38e88af..f0b20f4 100644 --- a/Lib/email/policy.py +++ b/Lib/email/policy.py @@ -5,6 +5,7 @@ code that adds all the email6 features. from email._policybase import Policy, Compat32, compat32, _extend_docstrings from email.utils import _has_surrogates from email.headerregistry import HeaderRegistry as HeaderRegistry +from email.contentmanager import raw_data_manager __all__ = [ 'Compat32', @@ -58,10 +59,22 @@ class EmailPolicy(Policy): special treatment, while all other fields are treated as unstructured. This list will be completed before the extension is marked stable.) + + content_manager -- an object with at least two methods: get_content + and set_content. When the get_content or + set_content method of a Message object is called, + it calls the corresponding method of this object, + passing it the message object as its first argument, + and any arguments or keywords that were passed to + it as additional arguments. The default + content_manager is + :data:`~email.contentmanager.raw_data_manager`. + """ refold_source = 'long' header_factory = HeaderRegistry() + content_manager = raw_data_manager def __init__(self, **kw): # Ensure that each new instance gets a unique header factory diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py index 30bf916..c1fe2b4 100644 --- a/Lib/email/quoprimime.py +++ b/Lib/email/quoprimime.py @@ -40,7 +40,6 @@ __all__ = [ ] import re -import io from string import ascii_letters, digits, hexdigits diff --git a/Lib/email/utils.py b/Lib/email/utils.py index f76c21e..5080d81 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -25,13 +25,10 @@ __all__ = [ import os import re import time -import base64 import random import socket import datetime import urllib.parse -import warnings -from io import StringIO from email._parseaddr import quote from email._parseaddr import AddressList as _AddressList @@ -39,10 +36,7 @@ from email._parseaddr import mktime_tz from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz -from quopri import decodestring as _qdecode - # Intrapackage imports -from email.encoders import _bencode, _qencode from email.charset import Charset COMMASPACE = ', ' @@ -54,17 +48,27 @@ TICK = "'" specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[\\"]') -# How to figure out if we are processing strings that come from a byte -# source with undecodable characters. -_has_surrogates = re.compile( - '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search +def _has_surrogates(s): + """Return True if s contains surrogate-escaped binary data.""" + # This check is based on the fact that unless there are surrogates, utf8 + # (Python's default encoding) can encode any string. This is the fastest + # way to check for surrogates, see issue 11454 for timings. + try: + s.encode() + return False + except UnicodeEncodeError: + return True # How to deal with a string containing bytes before handing it to the # application through the 'normal' interface. def _sanitize(string): - # Turn any escaped bytes into unicode 'unknown' char. - original_bytes = string.encode('ascii', 'surrogateescape') - return original_bytes.decode('ascii', 'replace') + # Turn any escaped bytes into unicode 'unknown' char. If the escaped + # bytes happen to be utf-8 they will instead get decoded, even if they + # were invalid in the charset the source was supposed to be in. This + # seems like it is not a bad thing; a defect was still registered. + original_bytes = string.encode('utf-8', 'surrogateescape') + return original_bytes.decode('utf-8', 'replace') + # Helpers @@ -151,30 +155,14 @@ def formatdate(timeval=None, localtime=False, usegmt=False): # 2822 requires that day and month names be the English abbreviations. if timeval is None: timeval = time.time() - if localtime: - now = time.localtime(timeval) - # Calculate timezone offset, based on whether the local zone has - # daylight savings time, and whether DST is in effect. - if time.daylight and now[-1]: - offset = time.altzone - else: - offset = time.timezone - hours, minutes = divmod(abs(offset), 3600) - # Remember offset is in seconds west of UTC, but the timezone is in - # minutes east of UTC, so the signs differ. - if offset > 0: - sign = '-' - else: - sign = '+' - zone = '%s%02d%02d' % (sign, hours, minutes // 60) + if localtime or usegmt: + dt = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc) else: - now = time.gmtime(timeval) - # Timezone offset is always -0000 - if usegmt: - zone = 'GMT' - else: - zone = '-0000' - return _format_timetuple_and_zone(now, zone) + dt = datetime.datetime.utcfromtimestamp(timeval) + if localtime: + dt = dt.astimezone() + usegmt = False + return format_datetime(dt, usegmt) def format_datetime(dt, usegmt=False): """Turn a datetime into a date string as specified in RFC 2822. @@ -198,24 +186,23 @@ def format_datetime(dt, usegmt=False): def make_msgid(idstring=None, domain=None): """Returns a string suitable for RFC 2822 compliant Message-ID, e.g: - <20020201195627.33539.96671@nightshade.la.mastaler.com> + <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com> Optional idstring if given is a string used to strengthen the uniqueness of the message id. Optional domain if given provides the portion of the message id after the '@'. It defaults to the locally defined hostname. """ - timeval = time.time() - utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval)) + timeval = int(time.time()*100) pid = os.getpid() - randint = random.randrange(100000) + randint = random.getrandbits(64) if idstring is None: idstring = '' else: idstring = '.' + idstring if domain is None: domain = socket.getfqdn() - msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain) + msgid = '<%d.%d.%d%s@%s>' % (timeval, pid, randint, idstring, domain) return msgid |
