From c27e52265b7ff4aa57dc357c289cce8c9dd0fec3 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 25 May 2012 15:01:48 -0400 Subject: #14731: refactor email policy framework. This patch primarily does two things: (1) it adds some internal-interface methods to Policy that allow for Policy to control the parsing and folding of headers in such a way that we can construct a backward compatibility policy that is 100% compatible with the 3.2 API, while allowing a new policy to implement the email6 API. (2) it adds that backward compatibility policy and refactors the test suite so that the only differences between the 3.2 test_email.py file and the 3.3 test_email.py file are some small changes in test framework and the addition of tests for bugs fixed that apply to the 3.2 API. There are some additional tweaks, such as moving just the code needed for the compatibility policy into _policybase, so that the library code can import only _policybase. That way the new code that will be added for email6 will only get imported when a non-compatibility policy is imported. 
--- Doc/library/email.generator.rst | 38 ++-- Doc/library/email.policy.rst | 290 ++++++++++++++++++++--------- Lib/email/_policybase.py | 338 ++++++++++++++++++++++++++++++++++ Lib/email/architecture.rst | 216 ++++++++++++++++++++++ Lib/email/feedparser.py | 24 ++- Lib/email/generator.py | 87 ++++----- Lib/email/message.py | 64 ++++--- Lib/email/parser.py | 4 +- Lib/email/policy.py | 172 +---------------- Lib/email/utils.py | 4 + Lib/test/test_email/__init__.py | 14 +- Lib/test/test_email/test_email.py | 180 ++++-------------- Lib/test/test_email/test_generator.py | 85 ++++++++- Lib/test/test_email/test_parser.py | 276 +++++++++++++++++++++++++++ Lib/test/test_email/test_policy.py | 114 +++++++++++- 15 files changed, 1389 insertions(+), 517 deletions(-) create mode 100644 Lib/email/_policybase.py create mode 100644 Lib/email/architecture.rst create mode 100644 Lib/test/test_email/test_parser.py diff --git a/Doc/library/email.generator.rst b/Doc/library/email.generator.rst index 03733ee..73440b8 100644 --- a/Doc/library/email.generator.rst +++ b/Doc/library/email.generator.rst @@ -32,8 +32,7 @@ Here are the public methods of the :class:`Generator` class, imported from the :mod:`email.generator` module: -.. class:: Generator(outfp, mangle_from_=True, maxheaderlen=78, *, \ - policy=policy.default) +.. class:: Generator(outfp, mangle_from_=True, maxheaderlen=78, *, policy=None) The constructor for the :class:`Generator` class takes a :term:`file-like object` called *outfp* for an argument. *outfp* must support the :meth:`write` method @@ -55,8 +54,9 @@ Here are the public methods of the :class:`Generator` class, imported from the The default is 78, as recommended (but not required) by :rfc:`2822`. The *policy* keyword specifies a :mod:`~email.policy` object that controls a - number of aspects of the generator's operation. The default policy - maintains backward compatibility. + number of aspects of the generator's operation. 
If no *policy* is specified, + then the *policy* attached to the message object passed to :meth:`flatten` + is used. .. versionchanged:: 3.3 Added the *policy* keyword. @@ -80,19 +80,19 @@ Here are the public methods of the :class:`Generator` class, imported from the Optional *linesep* specifies the line separator character used to terminate lines in the output. If specified it overrides the value - specified by the ``Generator``\'s ``policy``. + specified by the *msg*\'s or ``Generator``\'s ``policy``. - Because strings cannot represent non-ASCII bytes, ``Generator`` ignores - the value of the :attr:`~email.policy.Policy.must_be_7bit` - :mod:`~email.policy` setting and operates as if it were set ``True``. - This means that messages parsed with a Bytes parser that have a - :mailheader:`Content-Transfer-Encoding` of 8bit will be converted to a - use a 7bit Content-Transfer-Encoding. Non-ASCII bytes in the headers - will be :rfc:`2047` encoded with a charset of `unknown-8bit`. + Because strings cannot represent non-ASCII bytes, if the policy that + applies when ``flatten`` is run has :attr:`~email.policy.Policy.cte_type` + set to ``8bit``, ``Generator`` will operate as if it were set to + ``7bit``. This means that messages parsed with a Bytes parser that have + a :mailheader:`Content-Transfer-Encoding` of ``8bit`` will be converted + to use a ``7bit`` Content-Transfer-Encoding. Non-ASCII bytes in the + headers will be :rfc:`2047` encoded with a charset of ``unknown-8bit``. .. versionchanged:: 3.2 - Added support for re-encoding 8bit message bodies, and the *linesep* - argument. + Added support for re-encoding ``8bit`` message bodies, and the + *linesep* argument. .. method:: clone(fp) @@ -149,13 +149,13 @@ formatted string representation of a message object. For more detail, see at *msg* to the output file specified when the :class:`BytesGenerator` instance was created. Subparts are visited depth-first and the resulting text will be properly MIME encoded. 
If the :mod:`~email.policy` option - :attr:`~email.policy.Policy.must_be_7bit` is ``False`` (the default), + :attr:`~email.policy.Policy.cte_type` is ``8bit`` (the default), then any bytes with the high bit set in the original parsed message that have not been modified will be copied faithfully to the output. If - ``must_be_7bit`` is true, the bytes will be converted as needed using an - ASCII content-transfer-encoding. In particular, RFC-invalid non-ASCII - bytes in headers will be encoded using the MIME ``unknown-8bit`` - character set, thus rendering them RFC-compliant. + ``cte_type`` is ``7bit``, the bytes will be converted as needed + using an ASCII-compatible Content-Transfer-Encoding. In particular, + RFC-invalid non-ASCII bytes in headers will be encoded using the MIME + ``unknown-8bit`` character set, thus rendering them RFC-compliant. .. XXX: There should be a complementary option that just does the RFC compliance transformation but leaves CTE 8bit parts alone. diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index d9a292c..73cfba1 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -23,81 +23,100 @@ A :class:`Policy` object encapsulates a set of attributes and methods that control the behavior of various components of the email package during use. :class:`Policy` instances can be passed to various classes and methods in the email package to alter the default behavior. The settable values and their -defaults are described below. The :mod:`policy` module also provides some -pre-created :class:`Policy` instances. In addition to a :const:`default` -instance, there are instances tailored for certain applications. For example -there is an :const:`SMTP` :class:`Policy` with defaults appropriate for -generating output to be sent to an SMTP server. These are listed `below -`. - -In general an application will only need to deal with setting the policy at the -input and output boundaries. 
Once parsed, a message is represented by a -:class:`~email.message.Message` object, which is designed to be independent of -the format that the message has "on the wire" when it is received, transmitted, -or displayed. Thus, a :class:`Policy` can be specified when parsing a message -to create a :class:`~email.message.Message`, and again when turning the -:class:`~email.message.Message` into some other representation. While often a -program will use the same :class:`Policy` for both input and output, the two -can be different. +defaults are described below. + +There is a default policy used by all classes in the email package. This +policy is named :class:`Compat32`, with a corresponding pre-defined instance +named :const:`compat32`. It provides for complete backward compatibility (in +some cases, including bug compatibility) with the pre-Python3.3 version of the +email package. + +The first part of this documentation covers the features of :class:`Policy`, an +:term:`abstract base class` that defines the features that are common to all +policy objects, including :const:`compat32`. This includes certain hook +methods that are called internally by the email package, which a custom policy +could override to obtain different behavior. + +When a :class:`~email.message.Message` object is created, it acquires a policy. +By default this will be :const:`compat32`, but a different policy can be +specified. If the ``Message`` is created by a :mod:`~email.parser`, a policy +passed to the parser will be the policy used by the ``Message`` it creates. If +the ``Message`` is created by the program, then the policy can be specified +when it is created. When a ``Message`` is passed to a :mod:`~email.generator`, +the generator uses the policy from the ``Message`` by default, but you can also +pass a specific policy to the generator that will override the one stored on +the ``Message`` object. 
+ +:class:`Policy` instances are immutable, but they can be cloned, accepting the +same keyword arguments as the class constructor and returning a new +:class:`Policy` instance that is a copy of the original but with the specified +attributes values changed. As an example, the following code could be used to read an email message from a file on disk and pass it to the system ``sendmail`` program on a Unix system:: >>> from email import msg_from_binary_file >>> from email.generator import BytesGenerator - >>> import email.policy >>> from subprocess import Popen, PIPE >>> with open('mymsg.txt', 'b') as f: - ... msg = msg_from_binary_file(f, policy=email.policy.mbox) + ... msg = msg_from_binary_file(f) >>> p = Popen(['sendmail', msg['To'][0].address], stdin=PIPE) - >>> g = BytesGenerator(p.stdin, policy=email.policy.SMTP) + >>> g = BytesGenerator(p.stdin, policy=msg.policy.clone(linesep='\r\n')) >>> g.flatten(msg) >>> p.stdin.close() >>> rc = p.wait() -.. XXX email.policy.mbox/MBOX does not exist yet +Here we are telling :class:`~email.generator.BytesGenerator` to use the RFC +correct line separator characters when creating the binary string to feed into +``sendmail's`` ``stdin``, where the default policy would use ``\n`` line +separators. Some email package methods accept a *policy* keyword argument, allowing the policy to be overridden for that method. For example, the following code uses -the :meth:`~email.message.Message.as_string` method of the *msg* object from the -previous example and re-write it to a file using the native line separators for -the platform on which it is running:: +the :meth:`~email.message.Message.as_string` method of the *msg* object from +the previous example and writes the message to a file using the native line +separators for the platform on which it is running:: >>> import os - >>> mypolicy = email.policy.Policy(linesep=os.linesep) >>> with open('converted.txt', 'wb') as f: - ... 
f.write(msg.as_string(policy=mypolicy)) - -Policy instances are immutable, but they can be cloned, accepting the same -keyword arguments as the class constructor and returning a new :class:`Policy` -instance that is a copy of the original but with the specified attributes -values changed. For example, the following creates an SMTP policy that will -raise any defects detected as errors:: - - >>> strict_SMTP = email.policy.SMTP.clone(raise_on_defect=True) + ... f.write(msg.as_string(policy=msg.policy.clone(linesep=os.linesep))) Policy objects can also be combined using the addition operator, producing a policy object whose settings are a combination of the non-default values of the summed objects:: - >>> strict_SMTP = email.policy.SMTP + email.policy.strict + >>> compat_SMTP = email.policy.compat32.clone(linesep='\r\n') + >>> compat_strict = email.policy.compat32.clone(raise_on_defect=True) + >>> compat_strict_SMTP = compat_SMTP + compat_strict This operation is not commutative; that is, the order in which the objects are added matters. To illustrate:: - >>> Policy = email.policy.Policy - >>> apolicy = Policy(max_line_length=100) + Policy(max_line_length=80) + >>> policy100 = compat32.clone(max_line_length=100) + >>> policy80 = compat32.clone(max_line_length=80) + >>> apolicy = policy100 + policy80 >>> apolicy.max_line_length 80 - >>> apolicy = Policy(max_line_length=80) + Policy(max_line_length=100) + >>> apolicy = policy80 + policy100 >>> apolicy.max_line_length 100 .. class:: Policy(**kw) - The valid constructor keyword arguments are any of the attributes listed - below. + This is the :term:`abstract base class` for all policy classes. It provides + default implementations for a couple of trivial methods, as well as the + implementation of the immutability property, the :meth:`clone` method, and + the constructor semantics. + + The constructor of a policy class can be passed various keyword arguments. 
+ The arguments that may be specified are any non-method properties on this + class, plus any additional non-method properties on the concrete class. A + value specified in the constructor will override the default value for the + corresponding attribute. + + This class defines the following properties, and thus values for the + following may be passed in the constructor of any policy class: .. attribute:: max_line_length @@ -110,18 +129,28 @@ added matters. To illustrate:: The string to be used to terminate lines in serialized output. The default is ``\n`` because that's the internal end-of-line discipline used - by Python, though ``\r\n`` is required by the RFCs. See `Policy - Instances`_ for policies that use an RFC conformant linesep. Setting it - to :attr:`os.linesep` may also be useful. + by Python, though ``\r\n`` is required by the RFCs. + + .. attribute:: cte_type - .. attribute:: must_be_7bit + Controls the type of Content Transfer Encodings that may be or are + required to be used. The possible values are: - If ``True``, data output by a bytes generator is limited to ASCII - characters. If :const:`False` (the default), then bytes with the high - bit set are preserved and/or allowed in certain contexts (for example, - where possible a content transfer encoding of ``8bit`` will be used). - String generators act as if ``must_be_7bit`` is ``True`` regardless of - the policy in effect, since a string cannot represent non-ASCII bytes. + ======== =============================================================== + ``7bit`` all data must be "7 bit clean" (ASCII-only). This means that + where necessary data will be encoded using either + quoted-printable or base64 encoding. + + ``8bit`` data is not constrained to be 7 bit clean. Data in headers is + still required to be ASCII-only and so will be encoded (see + 'binary_fold' below for an exception), but body parts may use + the ``8bit`` CTE. 
+ ======== =============================================================== + + A ``cte_type`` value of ``8bit`` only works with ``BytesGenerator``, not + ``Generator``, because strings cannot contain binary data. If a + ``Generator`` is operating under a policy that specifies + ``cte_type=8bit``, it will act as if ``cte_type`` is ``7bit``. .. attribute:: raise_on_defect @@ -129,56 +158,151 @@ added matters. To illustrate:: :const:`False` (the default), defects will be passed to the :meth:`register_defect` method. - :mod:`Policy` object also have the following methods: + The following :class:`Policy` method is intended to be called by code using + the email library to create policy instances with custom settings: + + .. method:: clone(**kw) + + Return a new :class:`Policy` instance whose attributes have the same + values as the current instance, except where those attributes are + given new values by the keyword arguments. + + The remaining :class:`Policy` methods are called by the email package code, + and are not intended to be called by an application using the email package. + A custom policy must implement all of these methods. .. method:: handle_defect(obj, defect) - *obj* is the object on which to register the defect. *defect* should be - an instance of a subclass of :class:`~email.errors.Defect`. - If :attr:`raise_on_defect` - is ``True`` the defect is raised as an exception. Otherwise *obj* and - *defect* are passed to :meth:`register_defect`. This method is intended - to be called by parsers when they encounter defects, and will not be - called by code that uses the email library unless that code is - implementing an alternate parser. + Handle a *defect* found on *obj*. When the email package calls this + method, *defect* will always be a subclass of + :class:`~email.errors.Defect`. + + The default implementation checks the :attr:`raise_on_defect` flag. If + it is ``True``, *defect* is raised as an exception. 
If it is ``False`` + (the default), *obj* and *defect* are passed to :meth:`register_defect`. .. method:: register_defect(obj, defect) - *obj* is the object on which to register the defect. *defect* should be - a subclass of :class:`~email.errors.Defect`. This method is part of the - public API so that custom ``Policy`` subclasses can implement alternate - handling of defects. The default implementation calls the ``append`` - method of the ``defects`` attribute of *obj*. + Register a *defect* on *obj*. In the email package, *defect* will always + be a subclass of :class:`~email.errors.Defect`. - .. method:: clone(obj, *kw) + The default implementation calls the ``append`` method of the ``defects`` + attribute of *obj*. When the email package calls :attr:`handle_defect`, + *obj* will normally have a ``defects`` attribute that has an ``append`` + method. Custom object types used with the email package (for example, + custom ``Message`` objects) should also provide such an attribute, + otherwise defects in parsed messages will raise unexpected errors. - Return a new :class:`Policy` instance whose attributes have the same - values as the current instance, except where those attributes are - given new values by the keyword arguments. + .. method:: header_source_parse(sourcelines) + + The email package calls this method with a list of strings, each string + ending with the line separation characters found in the source being + parsed. The first line includes the field header name and separator. + All whitespace in the source is preserved. The method should return the + ``(name, value)`` tuple that is to be stored in the ``Message`` to + represent the parsed header. 
+ + If an implementation wishes to retain compatibility with the existing + email package policies, *name* should be the case preserved name (all + characters up to the '``:``' separator), while *value* should be the + unfolded value (all line separator characters removed, but whitespace + kept intact), stripped of leading whitespace. + + *sourcelines* may contain surrogateescaped binary data. + + There is no default implementation + + .. method:: header_store_parse(name, value) + + The email package calls this method with the name and value provided by + the application program when the application program is modifying a + ``Message`` programmatically (as opposed to a ``Message`` created by a + parser). The method should return the ``(name, value)`` tuple that is to + be stored in the ``Message`` to represent the header. + + If an implementation wishes to retain compatibility with the existing + email package policies, the *name* and *value* should be strings or + string subclasses that do not change the content of the passed in + arguments. + + There is no default implementation + + .. method:: header_fetch_parse(name, value) + + The email package calls this method with the *name* and *value* currently + stored in the ``Message`` when that header is requested by the + application program, and whatever the method returns is what is passed + back to the application as the value of the header being retrieved. + Note that there may be more than one header with the same name stored in + the ``Message``; the method is passed the specific name and value of the + header destined to be returned to the application. + + *value* may contain surrogateescaped binary data. There should be no + surrogateescaped binary data in the value returned by the method. + + There is no default implementation + + .. method:: fold(name, value) + + The email package calls this method with the *name* and *value* currently + stored in the ``Message`` for a given header. 
The method should return a + string that represents that header "folded" correctly (according to the + policy settings) by composing the *name* with the *value* and inserting + :attr:`linesep` characters at the appropriate places. See :rfc:`5322` + for a discussion of the rules for folding email headers. + + *value* may contain surrogateescaped binary data. There should be no + surrogateescaped binary data in the string returned by the method. + + .. method:: fold_binary(name, value) + + The same as :meth:`fold`, except that the returned value should be a + bytes object rather than a string. + + *value* may contain surrogateescaped binary data. These could be + converted back into binary data in the returned bytes object. + + +.. class:: Compat32(**kw) + + This concrete :class:`Policy` is the backward compatibility policy. It + replicates the behavior of the email package in Python 3.2. The + :mod:`policy` module also defines an instance of this class, + :const:`compat32`, that is used as the default policy. Thus the default + behavior of the email package is to maintain compatibility with Python 3.2. + The class provides the following concrete implementations of the + abstract methods of :class:`Policy`: -Policy Instances -^^^^^^^^^^^^^^^^ + .. method:: header_source_parse(sourcelines) -The following instances of :class:`Policy` provide defaults suitable for -specific common application domains. + The name is parsed as everything up to the '``:``' and returned + unmodified. The value is determined by stripping leading whitespace off + the remainder of the first line, joining all subsequent lines together, + and stripping any trailing carriage return or linefeed characters. -.. data:: default + .. method:: header_store_parse(name, value) - An instance of :class:`Policy` with all defaults unchanged. + The name and value are returned unmodified. -.. data:: SMTP + .. 
method:: header_fetch_parse(name, value) - Output serialized from a message will conform to the email and SMTP - RFCs. The only changed attribute is :attr:`linesep`, which is set to - ``\r\n``. + If the value contains binary data, it is converted into a + :class:`~email.header.Header` object using the ``unknown-8bit`` charset. + Otherwise it is returned unmodified. -.. data:: HTTP + .. method:: fold(name, value) - Suitable for use when serializing headers for use in HTTP traffic. - :attr:`linesep` is set to ``\r\n``, and :attr:`max_line_length` is set to - :const:`None` (unlimited). + Headers are folded using the :class:`~email.header.Header` folding + algorithm, which preserves existing line breaks in the value, and wraps + each resulting line to the ``max_line_length``. Non-ASCII binary data are + CTE encoded using the ``unknown-8bit`` charset. -.. data:: strict + .. method:: fold_binary(name, value) - :attr:`raise_on_defect` is set to :const:`True`. + Headers are folded using the :class:`~email.header.Header` folding + algorithm, which preserves existing line breaks in the value, and wraps + each resulting line to the ``max_line_length``. If ``cte_type`` is + ``7bit``, non-ascii binary data is CTE encoded using the ``unknown-8bit`` + charset. Otherwise the original source header is used, with its existing + line breaks and any (RFC invalid) binary data it may contain. diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py new file mode 100644 index 0000000..05736d0 --- /dev/null +++ b/Lib/email/_policybase.py @@ -0,0 +1,338 @@ +"""Policy framework for the email package. + +Allows fine grained feature control of how the package parses and emits data. +""" + +import abc +from email import header +from email import charset as _charset +from email.utils import _has_surrogates + +__all__ = [ + 'Policy', + 'Compat32', + 'compat32', + ] + + +class _PolicyBase: + + """Policy Object basic framework. + + This class is useless unless subclassed. 
A subclass should define + class attributes with defaults for any values that are to be + managed by the Policy object. The constructor will then allow + non-default values to be set for these attributes at instance + creation time. The instance will be callable, taking these same + attributes keyword arguments, and returning a new instance + identical to the called instance except for those values changed + by the keyword arguments. Instances may be added, yielding new + instances with any non-default values from the right hand + operand overriding those in the left hand operand. That is, + + A + B == A() + + The repr of an instance can be used to reconstruct the object + if and only if the repr of the values can be used to reconstruct + those values. + + """ + + def __init__(self, **kw): + """Create new Policy, possibly overriding some defaults. + + See class docstring for a list of overridable attributes. + + """ + for name, value in kw.items(): + if hasattr(self, name): + super(_PolicyBase,self).__setattr__(name, value) + else: + raise TypeError( + "{!r} is an invalid keyword argument for {}".format( + name, self.__class__.__name__)) + + def __repr__(self): + args = [ "{}={!r}".format(name, value) + for name, value in self.__dict__.items() ] + return "{}({})".format(self.__class__.__name__, ', '.join(args)) + + def clone(self, **kw): + """Return a new instance with specified attributes changed. + + The new instance has the same attribute values as the current object, + except for the changes passed in as keyword arguments. 
+ + """ + for attr, value in self.__dict__.items(): + if attr not in kw: + kw[attr] = value + return self.__class__(**kw) + + def __setattr__(self, name, value): + if hasattr(self, name): + msg = "{!r} object attribute {!r} is read-only" + else: + msg = "{!r} object has no attribute {!r}" + raise AttributeError(msg.format(self.__class__.__name__, name)) + + def __add__(self, other): + """Non-default values from right operand override those from left. + + The object returned is a new instance of the subclass. + + """ + return self.clone(**other.__dict__) + + +# Conceptually this isn't a subclass of ABCMeta, but since we want Policy to +# use ABCMeta as a metaclass *and* we want it to use this one as well, we have +# to make this one a subclas of ABCMeta. +class _DocstringExtenderMetaclass(abc.ABCMeta): + + def __new__(meta, classname, bases, classdict): + if classdict.get('__doc__') and classdict['__doc__'].startswith('+'): + classdict['__doc__'] = meta._append_doc(bases[0].__doc__, + classdict['__doc__']) + for name, attr in classdict.items(): + if attr.__doc__ and attr.__doc__.startswith('+'): + for cls in (cls for base in bases for cls in base.mro()): + doc = getattr(getattr(cls, name), '__doc__') + if doc: + attr.__doc__ = meta._append_doc(doc, attr.__doc__) + break + return super().__new__(meta, classname, bases, classdict) + + @staticmethod + def _append_doc(doc, added_doc): + added_doc = added_doc.split('\n', 1)[1] + return doc + '\n' + added_doc + + +class Policy(_PolicyBase, metaclass=_DocstringExtenderMetaclass): + + r"""Controls for how messages are interpreted and formatted. + + Most of the classes and many of the methods in the email package accept + Policy objects as parameters. A Policy object contains a set of values and + functions that control how input is interpreted and how output is rendered. 
+ For example, the parameter 'raise_on_defect' controls whether or not an RFC + violation results in an error being raised or not, while 'max_line_length' + controls the maximum length of output lines when a Message is serialized. + + Any valid attribute may be overridden when a Policy is created by passing + it as a keyword argument to the constructor. Policy objects are immutable, + but a new Policy object can be created with only certain values changed by + calling the Policy instance with keyword arguments. Policy objects can + also be added, producing a new Policy object in which the non-default + attributes set in the right hand operand overwrite those specified in the + left operand. + + Settable attributes: + + raise_on_defect -- If true, then defects should be raised as errors. + Default: False. + + linesep -- string containing the value to use as separation + between output lines. Default '\n'. + + cte_type -- Type of allowed content transfer encodings + + 7bit -- ASCII only + 8bit -- Content-Transfer-Encoding: 8bit is allowed + + Default: 8bit. Also controls the disposition of + (RFC invalid) binary data in headers; see the + documentation of the binary_fold method. + + max_line_length -- maximum length of lines, excluding 'linesep', + during serialization. None or 0 means no line + wrapping is done. Default is 78. + + """ + + raise_on_defect = False + linesep = '\n' + cte_type = '8bit' + max_line_length = 78 + + def handle_defect(self, obj, defect): + """Based on policy, either raise defect or call register_defect. + + handle_defect(obj, defect) + + defect should be a Defect subclass, but in any case must be an + Exception subclass. obj is the object on which the defect should be + registered if it is not raised. If the raise_on_defect is True, the + defect is raised as an error, otherwise the object and the defect are + passed to register_defect. + + This method is intended to be called by parsers that discover defects. 
+ The email package parsers always call it with Defect instances. + + """ + if self.raise_on_defect: + raise defect + self.register_defect(obj, defect) + + def register_defect(self, obj, defect): + """Record 'defect' on 'obj'. + + Called by handle_defect if raise_on_defect is False. This method is + part of the Policy API so that Policy subclasses can implement custom + defect handling. The default implementation calls the append method of + the defects attribute of obj. The objects used by the email package by + default that get passed to this method will always have a defects + attribute with an append method. + + """ + obj.defects.append(defect) + + @abc.abstractmethod + def header_source_parse(self, sourcelines): + """Given a list of linesep terminated strings constituting the lines of + a single header, return the (name, value) tuple that should be stored + in the model. The input lines should retain their terminating linesep + characters. The lines passed in by the email package may contain + surrogateescaped binary data. + """ + raise NotImplementedError + + @abc.abstractmethod + def header_store_parse(self, name, value): + """Given the header name and the value provided by the application + program, return the (name, value) that should be stored in the model. + """ + raise NotImplementedError + + @abc.abstractmethod + def header_fetch_parse(self, name, value): + """Given the header name and the value from the model, return the value + to be returned to the application program that is requesting that + header. The value passed in by the email package may contain + surrogateescaped binary data if the lines were parsed by a BytesParser. + The returned value should not contain any surrogateescaped data. 
+ + """ + raise NotImplementedError + + @abc.abstractmethod + def fold(self, name, value): + """Given the header name and the value from the model, return a string + containing linesep characters that implement the folding of the header + according to the policy controls. The value passed in by the email + package may contain surrogateescaped binary data if the lines were + parsed by a BytesParser. The returned value should not contain any + surrogateescaped data. + + """ + raise NotImplementedError + + @abc.abstractmethod + def fold_binary(self, name, value): + """Given the header name and the value from the model, return binary + data containing linesep characters that implement the folding of the + header according to the policy controls. The value passed in by the + email package may contain surrogateescaped binary data. + + """ + raise NotImplementedError + + +class Compat32(Policy): + + """+ + This particular policy is the backward compatibility Policy. It + replicates the behavior of the email package version 5.1. + """ + + def _sanitize_header(self, name, value): + # If the header value contains surrogates, return a Header using + # the unknown-8bit charset to encode the bytes as encoded words. + if not isinstance(value, str): + # Assume it is already a header object + return value + if _has_surrogates(value): + return header.Header(value, charset=_charset.UNKNOWN8BIT, + header_name=name) + else: + return value + + def header_source_parse(self, sourcelines): + """+ + The name is parsed as everything up to the ':' and returned unmodified. + The value is determined by stripping leading whitespace off the + remainder of the first line, joining all subsequent lines together, and + stripping any trailing carriage return or linefeed characters. 
+ + """ + name, value = sourcelines[0].split(':', 1) + value = value.lstrip(' \t') + ''.join(sourcelines[1:]) + return (name, value.rstrip('\r\n')) + + def header_store_parse(self, name, value): + """+ + The name and value are returned unmodified. + """ + return (name, value) + + def header_fetch_parse(self, name, value): + """+ + If the value contains binary data, it is converted into a Header object + using the unknown-8bit charset. Otherwise it is returned unmodified. + """ + return self._sanitize_header(name, value) + + def fold(self, name, value): + """+ + Headers are folded using the Header folding algorithm, which preserves + existing line breaks in the value, and wraps each resulting line to the + max_line_length. Non-ASCII binary data are CTE encoded using the + unknown-8bit charset. + + """ + return self._fold(name, value, sanitize=True) + + def fold_binary(self, name, value): + """+ + Headers are folded using the Header folding algorithm, which preserves + existing line breaks in the value, and wraps each resulting line to the + max_line_length. If cte_type is 7bit, non-ascii binary data is CTE + encoded using the unknown-8bit charset. Otherwise the original source + header is used, with its existing line breaks and/or binary data. + + """ + folded = self._fold(name, value, sanitize=self.cte_type=='7bit') + return folded.encode('ascii', 'surrogateescape') + + def _fold(self, name, value, sanitize): + parts = [] + parts.append('%s: ' % name) + if isinstance(value, str): + if _has_surrogates(value): + if sanitize: + h = header.Header(value, + charset=_charset.UNKNOWN8BIT, + header_name=name) + else: + # If we have raw 8bit data in a byte string, we have no idea + # what the encoding is. There is no safe way to split this + # string. If it's ascii-subset, then we could do a normal + # ascii split, but if it's multibyte then we could break the + # string. 
There's no way to know so the least harm seems to + # be to not split the string and risk it being too long. + parts.append(value) + h = None + else: + h = header.Header(value, header_name=name) + else: + # Assume it is a Header-like object. + h = value + if h is not None: + parts.append(h.encode(linesep=self.linesep, + maxlinelen=self.max_line_length)) + parts.append(self.linesep) + return ''.join(parts) + + +compat32 = Compat32() diff --git a/Lib/email/architecture.rst b/Lib/email/architecture.rst new file mode 100644 index 0000000..80d24fe --- /dev/null +++ b/Lib/email/architecture.rst @@ -0,0 +1,216 @@ +:mod:`email` Package Architecture +================================= + +Overview +-------- + +The email package consists of three major components: + + Model + An object structure that represents an email message, and provides an + API for creating, querying, and modifying a message. + + Parser + Takes a sequence of characters or bytes and produces a model of the + email message represented by those characters or bytes. + + Generator + Takes a model and turns it into a sequence of characters or bytes. The + sequence can either be intended for human consumption (a printable + unicode string) or bytes suitable for transmission over the wire. In + the latter case all data is properly encoded using the content transfer + encodings specified by the relevant RFCs. + +Conceptually the package is organized around the model. The model provides both +"external" APIs intended for use by application programs using the library, +and "internal" APIs intended for use by the Parser and Generator components. +This division is intentionally a bit fuzzy; the API described by this documentation +is all a public, stable API. This allows for an application with special needs +to implement its own parser and/or generator.
+ +In addition to the three major functional components, there is a fourth key +component to the architecture: + + Policy + An object that specifies various behavioral settings and carries + implementations of various behavior-controlling methods. + +The Policy framework provides a simple and convenient way to control the +behavior of the library, making it possible for the library to be used in a +very flexible fashion while leveraging the common code required to parse, +represent, and generate message-like objects. For example, in addition to the +default :rfc:`5322` email message policy, we also have a policy that manages +HTTP headers in a fashion compliant with :rfc:`2616`. Individual policy +controls, such as the maximum line length produced by the generator, can also +be set individually to meet specialized application requirements. + + +The Model +--------- + +The message model is implemented by the :class:`~email.message.Message` class. +The model divides a message into the two fundamental parts discussed by the +RFC: the header section and the body. The `Message` object acts as a +pseudo-dictionary of named headers. Its dictionary interface provides +convenient access to individual headers by name. However, all headers are kept +internally in an ordered list, so that the information about the order of the +headers in the original message is preserved. + +The `Message` object also has a `payload` that holds the body. A `payload` can +be one of two things: data, or a list of `Message` objects. The latter is used +to represent a multipart MIME message. Lists can be nested arbitrarily deeply +in order to represent the message, with all terminal leaves having non-list +data payloads. + + +Message Lifecycle +----------------- + +The general lifecycle of a message is: + + Creation + A `Message` object can be created by a Parser, or it can be + instantiated as an empty message by an application.
+ + Manipulation + The application may examine one or more headers, and/or the + payload, and it may modify one or more headers and/or + the payload. This may be done on the top level `Message` + object, or on any sub-object. + + Finalization + The Model is converted into a unicode or binary stream, + or the model is discarded. + + + +Header Policy Control During Lifecycle +-------------------------------------- + +One of the major controls exerted by the Policy is the management of headers +during the `Message` lifecycle. Most applications don't need to be aware of +this. + +A header enters the model in one of two ways: via a Parser, or by being set to +a specific value by an application program after the Model already exists. +Similarly, a header exits the model in one of two ways: by being serialized by +a Generator, or by being retrieved from a Model by an application program. The +Policy object provides hooks for all four of these pathways. + +The model storage for headers is a list of (name, value) tuples. + +The Parser identifies headers during parsing, and passes them to the +:meth:`~email.policy.Policy.header_source_parse` method of the Policy. The +result of that method is the (name, value) tuple to be stored in the model. + +When an application program supplies a header value (for example, through the +`Message` object `__setitem__` interface), the name and the value are passed to +the :meth:`~email.policy.Policy.header_store_parse` method of the Policy, which +returns the (name, value) tuple to be stored in the model. + +When an application program retrieves a header (through any of the dict or list +interfaces of `Message`), the name and value are passed to the +:meth:`~email.policy.Policy.header_fetch_parse` method of the Policy to +obtain the value returned to the application. 
+ +When a Generator requests a header during serialization, the name and value are +passed to the :meth:`~email.policy.Policy.fold` method of the Policy, which +returns a string containing line breaks in the appropriate places. The +:attr:`~email.policy.Policy.cte_type` Policy control determines whether or +not Content Transfer Encoding is performed on the data in the header. There is +also a :meth:`~email.policy.Policy.fold_binary` method for use by generators +that produce binary output, which returns the folded header as binary data, +possibly folded at different places than the corresponding string would be. + + +Handling Binary Data +-------------------- + +In an ideal world all message data would conform to the RFCs, meaning that the +parser could decode the message into the idealized unicode message that the +sender originally wrote. In the real world, the email package must also be +able to deal with badly formatted messages, including messages containing +non-ASCII characters that either have no indicated character set or are not +valid characters in the indicated character set. + +Since email messages are *primarily* text data, and operations on message data +are primarily text operations (except for binary payloads of course), the model +stores all text data as unicode strings. Un-decodable binary inside text +data is handled by using the `surrogateescape` error handler of the ASCII +codec. As with the binary filenames the error handler was introduced to +handle, this allows the email package to "carry" the binary data received +during parsing along until the output stage, at which time it is regenerated +in its original form. + +This carried binary data is almost entirely an implementation detail. The one +place where it is visible in the API is in the "internal" API. A Parser must +do the `surrogateescape` encoding of binary input data, and pass that data to +the appropriate Policy method.
The "internal" interface used by the Generator +to access header values preserves the `surrogateescaped` bytes. All other +interfaces convert the binary data either back into bytes or into a safe form +(losing information in some cases). + + +Backward Compatibility +---------------------- + +The :class:`~email.policy.Compat32` Policy provides backward +compatibility with version 5.1 of the email package. It does this via the +following implementation of the four+1 Policy methods described above: + +header_source_parse + Splits the first line on the colon to obtain the name, discards any spaces + or tabs after the colon, and joins the remainder of the line with all of the + remaining lines, preserving the linesep characters to obtain the value. + Trailing carriage return and/or linefeed characters are stripped from the + resulting value string. + +header_store_parse + Returns the name and value exactly as received from the application. + +header_fetch_parse + If the value contains any `surrogateescaped` binary data, returns the value + as a :class:`~email.header.Header` object, using the character set + `unknown-8bit`. Otherwise just returns the value. + +fold + Uses :class:`~email.header.Header`'s folding to fold headers in the + same way the email5.1 generator did. + +fold_binary + Same as fold, but returns bytes ('ascii' codec, `surrogateescape` handler). + + +New Algorithm +------------- + +header_source_parse + Same as legacy behavior. + +header_store_parse + Same as legacy behavior. + +header_fetch_parse + If the value is already a header object, returns it. Otherwise, parses the + value using the new parser, and returns the resulting object as the value. + `surrogateescaped` bytes get turned into unicode unknown character code + points. + +fold + Uses the new header folding algorithm, respecting the policy settings. + surrogateescaped bytes are encoded using the ``unknown-8bit`` charset for + ``cte_type=7bit`` or ``8bit``. Returns a string.
+ + At some point there will also be a ``cte_type=unicode``, and for that + policy fold will serialize the idealized unicode message with RFC-like + folding, converting any surrogateescaped bytes into the unicode + unknown character glyph. + +fold_binary + Uses the new header folding algorithm, respecting the policy settings. + surrogateescaped bytes are encoded using the ``unknown-8bit`` charset for + ``cte_type=7bit``, and get turned back into bytes for ``cte_type=8bit``. + Returns bytes. + + At some point there will also be a ``cte_type=unicode``, and for that + policy fold_binary will serialize the message according to :rfc:`5335`. diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 533ebdf..0706cae 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -25,7 +25,7 @@ import re from email import errors from email import message -from email import policy +from email._policybase import compat32 NLCRE = re.compile('\r\n|\r|\n') NLCRE_bol = re.compile('(\r\n|\r|\n)') @@ -135,7 +135,7 @@ class BufferedSubFile(object): class FeedParser: """A feed-style parser of email.""" - def __init__(self, _factory=message.Message, *, policy=policy.default): + def __init__(self, _factory=message.Message, *, policy=compat32): """_factory is called with no arguments to create a new message obj The policy keyword specifies a policy object that controls a number of @@ -145,6 +145,12 @@ class FeedParser: """ self._factory = _factory self.policy = policy + try: + _factory(policy=self.policy) + self._factory_kwds = lambda: {'policy': self.policy} + except TypeError: + # Assume this is an old-style factory + self._factory_kwds = lambda: {} self._input = BufferedSubFile() self._msgstack = [] self._parse = self._parsegen().__next__ @@ -181,7 +187,7 @@ class FeedParser: return root def _new_message(self): - msg = self._factory() + msg = self._factory(**self._factory_kwds()) if self._cur and self._cur.get_content_type() == 'multipart/digest':
msg.set_default_type('message/rfc822') if self._msgstack: @@ -458,9 +464,7 @@ class FeedParser: lastvalue.append(line) continue if lastheader: - # XXX reconsider the joining of folded lines - lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n') - self._cur[lastheader] = lhdr + self._cur.set_raw(*self.policy.header_source_parse(lastvalue)) lastheader, lastvalue = '', [] # Check for envelope header, i.e. unix-from if line.startswith('From '): @@ -487,16 +491,16 @@ class FeedParser: i = line.find(':') if i < 0: defect = errors.MalformedHeaderDefect(line) + # XXX: fixme (defect not going through policy) self._cur.defects.append(defect) continue lastheader = line[:i] - lastvalue = [line[i+1:].lstrip()] + lastvalue = [line] # Done with all the lines, so handle the last header. if lastheader: - # XXX reconsider the joining of folded lines - self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n') + self._cur.set_raw(*self.policy.header_source_parse(lastvalue)) + - class BytesFeedParser(FeedParser): """Like FeedParser, but feed accepts bytes.""" diff --git a/Lib/email/generator.py b/Lib/email/generator.py index dcfea4c..bfa288b 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -13,9 +13,9 @@ import random import warnings from io import StringIO, BytesIO -from email import policy +from email._policybase import compat32 from email.header import Header -from email.message import _has_surrogates +from email.utils import _has_surrogates import email.charset as _charset UNDERSCORE = '_' @@ -36,7 +36,7 @@ class Generator: # def __init__(self, outfp, mangle_from_=True, maxheaderlen=None, *, - policy=policy.default): + policy=None): """Create the generator for message flattening. outfp is the output file-like object for writing the message to. 
It @@ -60,8 +60,7 @@ class Generator: """ self._fp = outfp self._mangle_from_ = mangle_from_ - self._maxheaderlen = (maxheaderlen if maxheaderlen is not None else - policy.max_line_length) + self.maxheaderlen = maxheaderlen self.policy = policy def write(self, s): @@ -87,20 +86,33 @@ class Generator: # from the msg, and _encoded_XXX constants for operating on data that # has already been converted (to bytes in the BytesGenerator) and # inserted into a temporary buffer. - self._NL = linesep if linesep is not None else self.policy.linesep + policy = msg.policy if self.policy is None else self.policy + if linesep is not None: + policy = policy.clone(linesep=linesep) + if self.maxheaderlen is not None: + policy = policy.clone(max_line_length=self.maxheaderlen) + self._NL = policy.linesep self._encoded_NL = self._encode(self._NL) self._EMPTY = '' self._encoded_EMTPY = self._encode('') - if unixfrom: - ufrom = msg.get_unixfrom() - if not ufrom: - ufrom = 'From nobody ' + time.ctime(time.time()) - self.write(ufrom + self._NL) - self._write(msg) + p = self.policy + try: + self.policy = policy + if unixfrom: + ufrom = msg.get_unixfrom() + if not ufrom: + ufrom = 'From nobody ' + time.ctime(time.time()) + self.write(ufrom + self._NL) + self._write(msg) + finally: + self.policy = p def clone(self, fp): """Clone this generator with the exact same options.""" - return self.__class__(fp, self._mangle_from_, self._maxheaderlen) + return self.__class__(fp, + self._mangle_from_, + None, # Use policy setting, which we've adjusted + policy=self.policy) # # Protected interface - undocumented ;/ @@ -175,16 +187,8 @@ class Generator: # def _write_headers(self, msg): - for h, v in msg.items(): - self.write('%s: ' % h) - if isinstance(v, Header): - self.write(v.encode( - maxlinelen=self._maxheaderlen, linesep=self._NL)+self._NL) - else: - # Header's got lots of smarts, so use it. 
- header = Header(v, maxlinelen=self._maxheaderlen, - header_name=h) - self.write(header.encode(linesep=self._NL)+self._NL) + for h, v in msg.raw_items(): + self.write(self.policy.fold(h, v)) # A blank line always separates headers from body self.write(self._NL) @@ -265,12 +269,12 @@ class Generator: # The contents of signed parts has to stay unmodified in order to keep # the signature intact per RFC1847 2.1, so we disable header wrapping. # RDM: This isn't enough to completely preserve the part, but it helps. - old_maxheaderlen = self._maxheaderlen + p = self.policy + self.policy = p.clone(max_line_length=0) try: - self._maxheaderlen = 0 self._handle_multipart(msg) finally: - self._maxheaderlen = old_maxheaderlen + self.policy = p def _handle_message_delivery_status(self, msg): # We can't just write the headers directly to self's file object @@ -347,9 +351,9 @@ class BytesGenerator(Generator): Functionally identical to the base Generator except that the output is bytes and not string. When surrogates were used in the input to encode bytes, these are decoded back to bytes for output. If the policy has - must_be_7bit set true, then the message is transformed such that the - non-ASCII bytes are properly content transfer encoded, using the - charset unknown-8bit. + cte_type set to 7bit, then the message is transformed such that the + non-ASCII bytes are properly content transfer encoded, using the charset + unknown-8bit. The outfp object must accept bytes in its write method. """ @@ -370,27 +374,8 @@ class BytesGenerator(Generator): def _write_headers(self, msg): # This is almost the same as the string version, except for handling # strings with 8bit bytes. - for h, v in msg._headers: - self.write('%s: ' % h) - if isinstance(v, str): - if _has_surrogates(v): - if not self.policy.must_be_7bit: - # If we have raw 8bit data in a byte string, we have no idea - # what the encoding is. There is no safe way to split this - # string. 
If it's ascii-subset, then we could do a normal - # ascii split, but if it's multibyte then we could break the - # string. There's no way to know so the least harm seems to - # be to not split the string and risk it being too long. - self.write(v+NL) - continue - h = Header(v, charset=_charset.UNKNOWN8BIT, header_name=h) - else: - h = Header(v, header_name=h) - else: - # Assume it is a Header-like object. - h = v - self.write(h.encode(linesep=self._NL, - maxlinelen=self._maxheaderlen)+self._NL) + for h, v in msg.raw_items(): + self._fp.write(self.policy.fold_binary(h, v)) # A blank line always separates headers from body self.write(self._NL) @@ -399,7 +384,7 @@ class BytesGenerator(Generator): # just write it back out. if msg._payload is None: return - if _has_surrogates(msg._payload) and not self.policy.must_be_7bit: + if _has_surrogates(msg._payload) and not self.policy.cte_type=='7bit': self.write(msg._payload) else: super(BytesGenerator,self)._handle_text(msg) diff --git a/Lib/email/message.py b/Lib/email/message.py index 922617a..91976f1 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -10,13 +10,12 @@ import re import uu import base64 import binascii -import warnings from io import BytesIO, StringIO # Intrapackage imports from email import utils from email import errors -from email import header +from email._policybase import compat32 from email import charset as _charset Charset = _charset.Charset @@ -26,24 +25,6 @@ SEMISPACE = '; ' # existence of which force quoting of the parameter value. tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') -# How to figure out if we are processing strings that come from a byte -# source with undecodable characters. -_has_surrogates = re.compile( - '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search - - -# Helper functions -def _sanitize_header(name, value): - # If the header value contains surrogates, return a Header using - # the unknown-8bit charset to encode the bytes as encoded words. 
- if not isinstance(value, str): - # Assume it is already a header object - return value - if _has_surrogates(value): - return header.Header(value, charset=_charset.UNKNOWN8BIT, - header_name=name) - else: - return value def _splitparam(param): # Split header parameters. BAW: this may be too simple. It isn't @@ -136,7 +117,8 @@ class Message: you must use the explicit API to set or get all the headers. Not all of the mapping methods are implemented. """ - def __init__(self): + def __init__(self, policy=compat32): + self.policy = policy self._headers = [] self._unixfrom = None self._payload = None @@ -246,7 +228,7 @@ class Message: cte = str(self.get('content-transfer-encoding', '')).lower() # payload may be bytes here. if isinstance(payload, str): - if _has_surrogates(payload): + if utils._has_surrogates(payload): bpayload = payload.encode('ascii', 'surrogateescape') if not decode: try: @@ -362,7 +344,7 @@ class Message: Note: this does not overwrite an existing header with the same field name. Use __delitem__() first to delete any existing headers. """ - self._headers.append((name, val)) + self._headers.append(self.policy.header_store_parse(name, val)) def __delitem__(self, name): """Delete all occurrences of a header, if present. @@ -401,7 +383,8 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [_sanitize_header(k, v) for k, v in self._headers] + return [self.policy.header_fetch_parse(k, v) + for k, v in self._headers] def items(self): """Get all the message's header fields and values. @@ -411,7 +394,8 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [(k, _sanitize_header(k, v)) for k, v in self._headers] + return [(k, self.policy.header_fetch_parse(k, v)) + for k, v in self._headers] def get(self, name, failobj=None): """Get a header value. 
@@ -422,10 +406,29 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - return _sanitize_header(k, v) + return self.policy.header_fetch_parse(k, v) return failobj # + # "Internal" methods (public API, but only intended for use by a parser + # or generator, not normal application code. + # + + def set_raw(self, name, value): + """Store name and value in the model without modification. + + This is an "internal" API, intended only for use by a parser. + """ + self._headers.append((name, value)) + + def raw_items(self): + """Return the (name, value) header pairs without modification. + + This is an "internal" API, intended only for use by a generator. + """ + return iter(self._headers.copy()) + + # # Additional useful stuff # @@ -442,7 +445,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - values.append(_sanitize_header(k, v)) + values.append(self.policy.header_fetch_parse(k, v)) if not values: return failobj return values @@ -475,7 +478,7 @@ class Message: parts.append(_formatparam(k.replace('_', '-'), v)) if _value is not None: parts.insert(0, _value) - self._headers.append((_name, SEMISPACE.join(parts))) + self[_name] = SEMISPACE.join(parts) def replace_header(self, _name, _value): """Replace a header. 
@@ -487,7 +490,7 @@ class Message: _name = _name.lower() for i, (k, v) in zip(range(len(self._headers)), self._headers): if k.lower() == _name: - self._headers[i] = (k, _value) + self._headers[i] = self.policy.header_store_parse(k, _value) break else: raise KeyError(_name) @@ -805,7 +808,8 @@ class Message: parts.append(k) else: parts.append('%s=%s' % (k, v)) - newheaders.append((h, SEMISPACE.join(parts))) + val = SEMISPACE.join(parts) + newheaders.append(self.policy.header_store_parse(h, val)) else: newheaders.append((h, v)) diff --git a/Lib/email/parser.py b/Lib/email/parser.py index 0f92160..1aab012 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -11,12 +11,12 @@ from io import StringIO, TextIOWrapper from email.feedparser import FeedParser from email.message import Message -from email import policy +from email._policybase import compat32 class Parser: - def __init__(self, _class=Message, *, policy=policy.default): + def __init__(self, _class=Message, *, policy=compat32): """Parser of RFC 2822 and MIME email messages. Creates an in-memory object tree representing the email message, which diff --git a/Lib/email/policy.py b/Lib/email/policy.py index 585a752..dae2dc7 100644 --- a/Lib/email/policy.py +++ b/Lib/email/policy.py @@ -1,174 +1,12 @@ -"""Policy framework for the email package. - -Allows fine grained feature control of how the package parses and emits data. +"""This will be the home for the policy that hooks in the new +code that adds all the email6 features. """ -__all__ = [ - 'Policy', - 'default', - 'strict', - 'SMTP', - 'HTTP', - ] - - -class _PolicyBase: - - """Policy Object basic framework. - - This class is useless unless subclassed. A subclass should define - class attributes with defaults for any values that are to be - managed by the Policy object. The constructor will then allow - non-default values to be set for these attributes at instance - creation time. 
The instance will be callable, taking these same - attributes keyword arguments, and returning a new instance - identical to the called instance except for those values changed - by the keyword arguments. Instances may be added, yielding new - instances with any non-default values from the right hand - operand overriding those in the left hand operand. That is, - - A + B == A() - - The repr of an instance can be used to reconstruct the object - if and only if the repr of the values can be used to reconstruct - those values. - - """ - - def __init__(self, **kw): - """Create new Policy, possibly overriding some defaults. - - See class docstring for a list of overridable attributes. - - """ - for name, value in kw.items(): - if hasattr(self, name): - super(_PolicyBase,self).__setattr__(name, value) - else: - raise TypeError( - "{!r} is an invalid keyword argument for {}".format( - name, self.__class__.__name__)) - - def __repr__(self): - args = [ "{}={!r}".format(name, value) - for name, value in self.__dict__.items() ] - return "{}({})".format(self.__class__.__name__, ', '.join(args)) - - def clone(self, **kw): - """Return a new instance with specified attributes changed. - - The new instance has the same attribute values as the current object, - except for the changes passed in as keyword arguments. - - """ - for attr, value in self.__dict__.items(): - if attr not in kw: - kw[attr] = value - return self.__class__(**kw) - - def __setattr__(self, name, value): - if hasattr(self, name): - msg = "{!r} object attribute {!r} is read-only" - else: - msg = "{!r} object has no attribute {!r}" - raise AttributeError(msg.format(self.__class__.__name__, name)) - - def __add__(self, other): - """Non-default values from right operand override those from left. - - The object returned is a new instance of the subclass. - - """ - return self.clone(**other.__dict__) - - -class Policy(_PolicyBase): - - """Controls for how messages are interpreted and formatted. 
- - Most of the classes and many of the methods in the email package - accept Policy objects as parameters. A Policy object contains a set - of values and functions that control how input is interpreted and how - output is rendered. For example, the parameter 'raise_on_defect' - controls whether or not an RFC violation throws an error or not, - while 'max_line_length' controls the maximum length of output lines - when a Message is serialized. - - Any valid attribute may be overridden when a Policy is created by - passing it as a keyword argument to the constructor. Policy - objects are immutable, but a new Policy object can be created - with only certain values changed by calling the Policy instance - with keyword arguments. Policy objects can also be added, - producing a new Policy object in which the non-default attributes - set in the right hand operand overwrite those specified in the - left operand. - - Settable attributes: - - raise_on_defect -- If true, then defects should be raised - as errors. Default False. - - linesep -- string containing the value to use as - separation between output lines. Default '\n'. - - must_be_7bit -- output must contain only 7bit clean data. - Default False. - - max_line_length -- maximum length of lines, excluding 'linesep', - during serialization. None means no line - wrapping is done. Default is 78. - - Methods: - - register_defect(obj, defect) - defect is a Defect instance. The default implementation appends defect - to the objs 'defects' attribute. - - handle_defect(obj, defect) - intended to be called by parser code that finds a defect. If - raise_on_defect is True, defect is raised as an error, otherwise - register_defect is called. - - """ - - raise_on_defect = False - linesep = '\n' - must_be_7bit = False - max_line_length = 78 - - def handle_defect(self, obj, defect): - """Based on policy, either raise defect or call register_defect. 
- - handle_defect(obj, defect) - - defect should be a Defect subclass, but in any case must be an - Exception subclass. obj is the object on which the defect should be - registered if it is not raised. If the raise_on_defect is True, the - defect is raised as an error, otherwise the object and the defect are - passed to register_defect. - - This class is intended to be called by parsers that discover defects, - and will not be called from code using the library unless that code is - implementing an alternate parser. - - """ - if self.raise_on_defect: - raise defect - self.register_defect(obj, defect) - - def register_defect(self, obj, defect): - """Record 'defect' on 'obj'. - - Called by handle_defect if raise_on_defect is False. This method is - part of the Policy API so that Policy subclasses can implement custom - defect handling. The default implementation calls the append method - of the defects attribute of obj. - - """ - obj.defects.append(defect) +from email._policybase import Policy, compat32, Compat32 +# XXX: temporarily derive everything from compat32. -default = Policy() +default = compat32 strict = default.clone(raise_on_defect=True) SMTP = default.clone(linesep='\r\n') HTTP = default.clone(linesep='\r\n', max_line_length=None) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 138f05d..b82d5c5 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -57,6 +57,10 @@ TICK = "'" specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[\\"]') +# How to figure out if we are processing strings that come from a byte +# source with undecodable characters. 
+_has_surrogates = re.compile( + '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search # Helpers diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index d72b50e..b05fb3c 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -3,6 +3,8 @@ import sys import unittest import test.support import email +from email.message import Message +from email._policybase import compat32 from test.test_email import __file__ as landmark # Run all tests in package for '-m unittest test.test_email' @@ -36,16 +38,26 @@ def openfile(filename, *args, **kws): class TestEmailBase(unittest.TestCase): maxDiff = None + # Currently the default policy is compat32. By setting that as the default + # here we make minimal changes in the test_email tests compared to their + # pre-3.3 state. + policy = compat32 def __init__(self, *args, **kw): super().__init__(*args, **kw) self.addTypeEqualityFunc(bytes, self.assertBytesEqual) + # Backward compatibility to minimize test_email test changes. 
ndiffAssertEqual = unittest.TestCase.assertEqual def _msgobj(self, filename): with openfile(filename) as fp: - return email.message_from_file(fp) + return email.message_from_file(fp, policy=self.policy) + + def _str_msg(self, string, message=Message, policy=None): + if policy is None: + policy = self.policy + return email.message_from_string(string, message, policy=policy) def _bytes_repr(self, b): return [repr(x) for x in b.splitlines(keepends=True)] diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index b07f675..ac6ee65 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -16,6 +16,7 @@ from io import StringIO, BytesIO from itertools import chain import email +import email.policy from email.charset import Charset from email.header import Header, decode_header, make_header @@ -1805,11 +1806,7 @@ YXNkZg== # Test some badly formatted messages -class TestNonConformantBase: - - def _msgobj(self, filename): - with openfile(filename) as fp: - return email.message_from_file(fp, policy=self.policy) +class TestNonConformant(TestEmailBase): def test_parse_missing_minor_type(self): eq = self.assertEqual @@ -1818,24 +1815,26 @@ class TestNonConformantBase: eq(msg.get_content_maintype(), 'text') eq(msg.get_content_subtype(), 'plain') + # test_parser.TestMessageDefectDetectionBase def test_same_boundary_inner_outer(self): unless = self.assertTrue msg = self._msgobj('msg_15.txt') # XXX We can probably eventually do better inner = msg.get_payload(0) unless(hasattr(inner, 'defects')) - self.assertEqual(len(self.get_defects(inner)), 1) - unless(isinstance(self.get_defects(inner)[0], + self.assertEqual(len(inner.defects), 1) + unless(isinstance(inner.defects[0], errors.StartBoundaryNotFoundDefect)) + # test_parser.TestMessageDefectDetectionBase def test_multipart_no_boundary(self): unless = self.assertTrue msg = self._msgobj('msg_25.txt') unless(isinstance(msg.get_payload(), str)) - 
self.assertEqual(len(self.get_defects(msg)), 2) - unless(isinstance(self.get_defects(msg)[0], + self.assertEqual(len(msg.defects), 2) + unless(isinstance(msg.defects[0], errors.NoBoundaryInMultipartDefect)) - unless(isinstance(self.get_defects(msg)[1], + unless(isinstance(msg.defects[1], errors.MultipartInvariantViolationDefect)) multipart_msg = textwrap.dedent("""\ @@ -1861,27 +1860,26 @@ class TestNonConformantBase: --===============3344438784458119861==-- """) + # test_parser.TestMessageDefectDetectionBase def test_multipart_invalid_cte(self): - msg = email.message_from_string( - self.multipart_msg.format("\nContent-Transfer-Encoding: base64"), - policy = self.policy) - self.assertEqual(len(self.get_defects(msg)), 1) - self.assertIsInstance(self.get_defects(msg)[0], + msg = self._str_msg( + self.multipart_msg.format("\nContent-Transfer-Encoding: base64")) + self.assertEqual(len(msg.defects), 1) + self.assertIsInstance(msg.defects[0], errors.InvalidMultipartContentTransferEncodingDefect) + # test_parser.TestMessageDefectDetectionBase def test_multipart_no_cte_no_defect(self): - msg = email.message_from_string( - self.multipart_msg.format(''), - policy = self.policy) - self.assertEqual(len(self.get_defects(msg)), 0) + msg = self._str_msg(self.multipart_msg.format('')) + self.assertEqual(len(msg.defects), 0) + # test_parser.TestMessageDefectDetectionBase def test_multipart_valid_cte_no_defect(self): for cte in ('7bit', '8bit', 'BINary'): - msg = email.message_from_string( + msg = self._str_msg( self.multipart_msg.format( - "\nContent-Transfer-Encoding: {}".format(cte)), - policy = self.policy) - self.assertEqual(len(self.get_defects(msg)), 0) + "\nContent-Transfer-Encoding: {}".format(cte))) + self.assertEqual(len(msg.defects), 0) def test_invalid_content_type(self): eq = self.assertEqual @@ -1932,16 +1930,18 @@ Subject: here's something interesting counter to RFC 2822, there's no separating newline here """) + # test_parser.TestMessageDefectDetectionBase def 
test_lying_multipart(self): unless = self.assertTrue msg = self._msgobj('msg_41.txt') unless(hasattr(msg, 'defects')) - self.assertEqual(len(self.get_defects(msg)), 2) - unless(isinstance(self.get_defects(msg)[0], + self.assertEqual(len(msg.defects), 2) + unless(isinstance(msg.defects[0], errors.NoBoundaryInMultipartDefect)) - unless(isinstance(self.get_defects(msg)[1], + unless(isinstance(msg.defects[1], errors.MultipartInvariantViolationDefect)) + # test_parser.TestMessageDefectDetectionBase def test_missing_start_boundary(self): outer = self._msgobj('msg_42.txt') # The message structure is: @@ -1953,71 +1953,21 @@ counter to RFC 2822, there's no separating newline here # # [*] This message is missing its start boundary bad = outer.get_payload(1).get_payload(0) - self.assertEqual(len(self.get_defects(bad)), 1) - self.assertTrue(isinstance(self.get_defects(bad)[0], + self.assertEqual(len(bad.defects), 1) + self.assertTrue(isinstance(bad.defects[0], errors.StartBoundaryNotFoundDefect)) + # test_parser.TestMessageDefectDetectionBase def test_first_line_is_continuation_header(self): eq = self.assertEqual m = ' Line 1\nLine 2\nLine 3' - msg = email.message_from_string(m, policy=self.policy) + msg = email.message_from_string(m) eq(msg.keys(), []) eq(msg.get_payload(), 'Line 2\nLine 3') - eq(len(self.get_defects(msg)), 1) - self.assertTrue(isinstance(self.get_defects(msg)[0], + eq(len(msg.defects), 1) + self.assertTrue(isinstance(msg.defects[0], errors.FirstHeaderLineIsContinuationDefect)) - eq(self.get_defects(msg)[0].line, ' Line 1\n') - - -class TestNonConformant(TestNonConformantBase, TestEmailBase): - - policy=email.policy.default - - def get_defects(self, obj): - return obj.defects - - -class TestNonConformantCapture(TestNonConformantBase, TestEmailBase): - - class CapturePolicy(email.policy.Policy): - captured = None - def register_defect(self, obj, defect): - self.captured.append(defect) - - def setUp(self): - self.policy = self.CapturePolicy(captured=list()) - 
- def get_defects(self, obj): - return self.policy.captured - - -class TestRaisingDefects(TestEmailBase): - - def _msgobj(self, filename): - with openfile(filename) as fp: - return email.message_from_file(fp, policy=email.policy.strict) - - def test_same_boundary_inner_outer(self): - with self.assertRaises(errors.StartBoundaryNotFoundDefect): - self._msgobj('msg_15.txt') - - def test_multipart_no_boundary(self): - with self.assertRaises(errors.NoBoundaryInMultipartDefect): - self._msgobj('msg_25.txt') - - def test_lying_multipart(self): - with self.assertRaises(errors.NoBoundaryInMultipartDefect): - self._msgobj('msg_41.txt') - - - def test_missing_start_boundary(self): - with self.assertRaises(errors.StartBoundaryNotFoundDefect): - self._msgobj('msg_42.txt') - - def test_first_line_is_continuation_header(self): - m = ' Line 1\nLine 2\nLine 3' - with self.assertRaises(errors.FirstHeaderLineIsContinuationDefect): - msg = email.message_from_string(m, policy=email.policy.strict) + eq(msg.defects[0].line, ' Line 1\n') # Test RFC 2047 header encoding and decoding @@ -2610,6 +2560,13 @@ class TestMiscellaneous(TestEmailBase): for subpart in msg.walk(): unless(isinstance(subpart, MyMessage)) + def test_custom_message_does_not_require_arguments(self): + class MyMessage(Message): + def __init__(self): + super().__init__() + msg = self._str_msg("Subject: test\n\ntest", MyMessage) + self.assertTrue(isinstance(msg, MyMessage)) + def test__all__(self): module = __import__('email') self.assertEqual(sorted(module.__all__), [ @@ -3137,25 +3094,6 @@ Here's the message body g.flatten(msg, linesep='\r\n') self.assertEqual(s.getvalue(), text) - def test_crlf_control_via_policy(self): - with openfile('msg_26.txt', newline='\n') as fp: - text = fp.read() - msg = email.message_from_string(text) - s = StringIO() - g = email.generator.Generator(s, policy=email.policy.SMTP) - g.flatten(msg) - self.assertEqual(s.getvalue(), text) - - def test_flatten_linesep_overrides_policy(self): - # 
msg_27 is lf separated - with openfile('msg_27.txt', newline='\n') as fp: - text = fp.read() - msg = email.message_from_string(text) - s = StringIO() - g = email.generator.Generator(s, policy=email.policy.SMTP) - g.flatten(msg, linesep='\n') - self.assertEqual(s.getvalue(), text) - maxDiff = None def test_multipart_digest_with_extra_mime_headers(self): @@ -3646,44 +3584,6 @@ class Test8BitBytesHandling(unittest.TestCase): s.getvalue(), 'Subject: =?utf-8?b?xb5sdcWlb3XEjWvDvSBrxa/FiA==?=\r\n\r\n') - def test_crlf_control_via_policy(self): - # msg_26 is crlf terminated - with openfile('msg_26.txt', 'rb') as fp: - text = fp.read() - msg = email.message_from_bytes(text) - s = BytesIO() - g = email.generator.BytesGenerator(s, policy=email.policy.SMTP) - g.flatten(msg) - self.assertEqual(s.getvalue(), text) - - def test_flatten_linesep_overrides_policy(self): - # msg_27 is lf separated - with openfile('msg_27.txt', 'rb') as fp: - text = fp.read() - msg = email.message_from_bytes(text) - s = BytesIO() - g = email.generator.BytesGenerator(s, policy=email.policy.SMTP) - g.flatten(msg, linesep='\n') - self.assertEqual(s.getvalue(), text) - - def test_must_be_7bit_handles_unknown_8bit(self): - msg = email.message_from_bytes(self.non_latin_bin_msg) - out = BytesIO() - g = email.generator.BytesGenerator(out, - policy=email.policy.default.clone(must_be_7bit=True)) - g.flatten(msg) - self.assertEqual(out.getvalue(), - self.non_latin_bin_msg_as7bit_wrapped.encode('ascii')) - - def test_must_be_7bit_transforms_8bit_cte(self): - msg = email.message_from_bytes(self.latin_bin_msg) - out = BytesIO() - g = email.generator.BytesGenerator(out, - policy=email.policy.default.clone(must_be_7bit=True)) - g.flatten(msg) - self.assertEqual(out.getvalue(), - self.latin_bin_msg_as7bit.encode('ascii')) - maxDiff = None diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 35ca6c5..8f5fde7 100644 --- a/Lib/test/test_email/test_generator.py +++ 
b/Lib/test/test_email/test_generator.py @@ -11,6 +11,8 @@ from test.test_email import TestEmailBase class TestGeneratorBase(): + policy = policy.compat32 + long_subject = { 0: textwrap.dedent("""\ To: whom_it_may_concern@example.com @@ -58,11 +60,11 @@ class TestGeneratorBase(): long_subject[100] = long_subject[0] def maxheaderlen_parameter_test(self, n): - msg = self.msgmaker(self.long_subject[0]) + msg = self.msgmaker(self.typ(self.long_subject[0])) s = self.ioclass() g = self.genclass(s, maxheaderlen=n) g.flatten(msg) - self.assertEqual(s.getvalue(), self.long_subject[n]) + self.assertEqual(s.getvalue(), self.typ(self.long_subject[n])) def test_maxheaderlen_parameter_0(self): self.maxheaderlen_parameter_test(0) @@ -77,11 +79,11 @@ class TestGeneratorBase(): self.maxheaderlen_parameter_test(20) def maxheaderlen_policy_test(self, n): - msg = self.msgmaker(self.long_subject[0]) + msg = self.msgmaker(self.typ(self.long_subject[0])) s = self.ioclass() g = self.genclass(s, policy=policy.default.clone(max_line_length=n)) g.flatten(msg) - self.assertEqual(s.getvalue(), self.long_subject[n]) + self.assertEqual(s.getvalue(), self.typ(self.long_subject[n])) def test_maxheaderlen_policy_0(self): self.maxheaderlen_policy_test(0) @@ -96,12 +98,12 @@ class TestGeneratorBase(): self.maxheaderlen_policy_test(20) def maxheaderlen_parm_overrides_policy_test(self, n): - msg = self.msgmaker(self.long_subject[0]) + msg = self.msgmaker(self.typ(self.long_subject[0])) s = self.ioclass() g = self.genclass(s, maxheaderlen=n, policy=policy.default.clone(max_line_length=10)) g.flatten(msg) - self.assertEqual(s.getvalue(), self.long_subject[n]) + self.assertEqual(s.getvalue(), self.typ(self.long_subject[n])) def test_maxheaderlen_parm_overrides_policy_0(self): self.maxheaderlen_parm_overrides_policy_test(0) @@ -115,21 +117,84 @@ class TestGeneratorBase(): def test_maxheaderlen_parm_overrides_policy_20(self): self.maxheaderlen_parm_overrides_policy_test(20) + def 
test_crlf_control_via_policy(self): + source = "Subject: test\r\n\r\ntest body\r\n" + expected = source + msg = self.msgmaker(self.typ(source)) + s = self.ioclass() + g = self.genclass(s, policy=policy.SMTP) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + + def test_flatten_linesep_overrides_policy(self): + source = "Subject: test\n\ntest body\n" + expected = source + msg = self.msgmaker(self.typ(source)) + s = self.ioclass() + g = self.genclass(s, policy=policy.SMTP) + g.flatten(msg, linesep='\n') + self.assertEqual(s.getvalue(), self.typ(expected)) + class TestGenerator(TestGeneratorBase, TestEmailBase): - msgmaker = staticmethod(message_from_string) genclass = Generator ioclass = io.StringIO + typ = str + + def msgmaker(self, msg, policy=None): + policy = self.policy if policy is None else policy + return message_from_string(msg, policy=policy) class TestBytesGenerator(TestGeneratorBase, TestEmailBase): - msgmaker = staticmethod(message_from_bytes) genclass = BytesGenerator ioclass = io.BytesIO - long_subject = {key: x.encode('ascii') - for key, x in TestGeneratorBase.long_subject.items()} + typ = lambda self, x: x.encode('ascii') + + def msgmaker(self, msg, policy=None): + policy = self.policy if policy is None else policy + return message_from_bytes(msg, policy=policy) + + def test_cte_type_7bit_handles_unknown_8bit(self): + source = ("Subject: Maintenant je vous présente mon " + "collègue\n\n").encode('utf-8') + expected = ('Subject: =?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_' + 'coll=C3=A8gue?=\n\n').encode('ascii') + msg = message_from_bytes(source) + s = io.BytesIO() + g = BytesGenerator(s, policy=self.policy.clone(cte_type='7bit')) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_cte_type_7bit_transforms_8bit_cte(self): + source = textwrap.dedent("""\ + From: foo@bar.com + To: Dinsdale + Subject: Nudge nudge, wink, wink + Mime-Version: 1.0 + Content-Type: text/plain; charset="latin-1" + 
Content-Transfer-Encoding: 8bit + + oh là là, know what I mean, know what I mean? + """).encode('latin1') + msg = message_from_bytes(source) + expected = textwrap.dedent("""\ + From: foo@bar.com + To: Dinsdale + Subject: Nudge nudge, wink, wink + Mime-Version: 1.0 + Content-Type: text/plain; charset="iso-8859-1" + Content-Transfer-Encoding: quoted-printable + + oh l=E0 l=E0, know what I mean, know what I mean? + """).encode('ascii') + s = io.BytesIO() + g = BytesGenerator(s, policy=self.policy.clone(cte_type='7bit', + linesep='\n')) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) if __name__ == '__main__': diff --git a/Lib/test/test_email/test_parser.py b/Lib/test/test_email/test_parser.py new file mode 100644 index 0000000..864e4c1 --- /dev/null +++ b/Lib/test/test_email/test_parser.py @@ -0,0 +1,276 @@ +import io +import email +import textwrap +import unittest +from email._policybase import Compat32 +from email import errors +from email.message import Message +from test.test_email import TestEmailBase + + +class TestCustomMessage(TestEmailBase): + + class MyMessage(Message): + def __init__(self, policy): + self.check_policy = policy + super().__init__() + + MyPolicy = TestEmailBase.policy.clone(linesep='boo') + + def test_custom_message_gets_policy_if_possible_from_string(self): + msg = email.message_from_string("Subject: bogus\n\nmsg\n", + self.MyMessage, + policy=self.MyPolicy) + self.assertTrue(isinstance(msg, self.MyMessage)) + self.assertIs(msg.check_policy, self.MyPolicy) + + def test_custom_message_gets_policy_if_possible_from_file(self): + source_file = io.StringIO("Subject: bogus\n\nmsg\n") + msg = email.message_from_file(source_file, + self.MyMessage, + policy=self.MyPolicy) + self.assertTrue(isinstance(msg, self.MyMessage)) + self.assertIs(msg.check_policy, self.MyPolicy) + + # XXX add tests for other functions that take Message arg. 
+ + +class TestMessageDefectDetectionBase: + + dup_boundary_msg = textwrap.dedent("""\ + Subject: XX + From: xx@xx.dk + To: XX + Mime-version: 1.0 + Content-type: multipart/mixed; + boundary="MS_Mac_OE_3071477847_720252_MIME_Part" + + --MS_Mac_OE_3071477847_720252_MIME_Part + Content-type: multipart/alternative; + boundary="MS_Mac_OE_3071477847_720252_MIME_Part" + + --MS_Mac_OE_3071477847_720252_MIME_Part + Content-type: text/plain; charset="ISO-8859-1" + Content-transfer-encoding: quoted-printable + + text + + --MS_Mac_OE_3071477847_720252_MIME_Part + Content-type: text/html; charset="ISO-8859-1" + Content-transfer-encoding: quoted-printable + + + + --MS_Mac_OE_3071477847_720252_MIME_Part-- + + --MS_Mac_OE_3071477847_720252_MIME_Part + Content-type: image/gif; name="xx.gif"; + Content-disposition: attachment + Content-transfer-encoding: base64 + + Some removed base64 encoded chars. + + --MS_Mac_OE_3071477847_720252_MIME_Part-- + + """) + + def test_same_boundary_inner_outer(self): + # XXX better would be to actually detect the duplicate. 
+ msg = self._str_msg(self.dup_boundary_msg) + inner = msg.get_payload(0) + self.assertTrue(hasattr(inner, 'defects')) + self.assertEqual(len(self.get_defects(inner)), 1) + self.assertTrue(isinstance(self.get_defects(inner)[0], + errors.StartBoundaryNotFoundDefect)) + + def test_same_boundary_inner_outer_raises_on_defect(self): + with self.assertRaises(errors.StartBoundaryNotFoundDefect): + self._str_msg(self.dup_boundary_msg, + policy=self.policy.clone(raise_on_defect=True)) + + no_boundary_msg = textwrap.dedent("""\ + Date: Fri, 6 Apr 2001 09:23:06 -0800 (GMT-0800) + From: foobar + Subject: broken mail + MIME-Version: 1.0 + Content-Type: multipart/report; report-type=delivery-status; + + --JAB03225.986577786/zinfandel.lacita.com + + One part + + --JAB03225.986577786/zinfandel.lacita.com + Content-Type: message/delivery-status + + Header: Another part + + --JAB03225.986577786/zinfandel.lacita.com-- + """) + + def test_multipart_no_boundary(self): + msg = self._str_msg(self.no_boundary_msg) + self.assertTrue(isinstance(msg.get_payload(), str)) + self.assertEqual(len(self.get_defects(msg)), 2) + self.assertTrue(isinstance(self.get_defects(msg)[0], + errors.NoBoundaryInMultipartDefect)) + self.assertTrue(isinstance(self.get_defects(msg)[1], + errors.MultipartInvariantViolationDefect)) + + def test_multipart_no_boundary_raise_on_defect(self): + with self.assertRaises(errors.NoBoundaryInMultipartDefect): + self._str_msg(self.no_boundary_msg, + policy=self.policy.clone(raise_on_defect=True)) + + multipart_msg = textwrap.dedent("""\ + Date: Wed, 14 Nov 2007 12:56:23 GMT + From: foo@bar.invalid + To: foo@bar.invalid + Subject: Content-Transfer-Encoding: base64 and multipart + MIME-Version: 1.0 + Content-Type: multipart/mixed; + boundary="===============3344438784458119861=="{} + + --===============3344438784458119861== + Content-Type: text/plain + + Test message + + --===============3344438784458119861== + Content-Type: application/octet-stream + 
Content-Transfer-Encoding: base64 + + YWJj + + --===============3344438784458119861==-- + """) + + def test_multipart_invalid_cte(self): + msg = self._str_msg( + self.multipart_msg.format("\nContent-Transfer-Encoding: base64")) + self.assertEqual(len(self.get_defects(msg)), 1) + self.assertIsInstance(self.get_defects(msg)[0], + errors.InvalidMultipartContentTransferEncodingDefect) + + def test_multipart_invalid_cte_raise_on_defect(self): + with self.assertRaises( + errors.InvalidMultipartContentTransferEncodingDefect): + self._str_msg( + self.multipart_msg.format( + "\nContent-Transfer-Encoding: base64"), + policy=self.policy.clone(raise_on_defect=True)) + + def test_multipart_no_cte_no_defect(self): + msg = self._str_msg(self.multipart_msg.format('')) + self.assertEqual(len(self.get_defects(msg)), 0) + + def test_multipart_valid_cte_no_defect(self): + for cte in ('7bit', '8bit', 'BINary'): + msg = self._str_msg( + self.multipart_msg.format("\nContent-Transfer-Encoding: "+cte)) + self.assertEqual(len(self.get_defects(msg)), 0, "cte="+cte) + + lying_multipart_msg = textwrap.dedent("""\ + From: "Allison Dunlap" + To: yyy@example.com + Subject: 64423 + Date: Sun, 11 Jul 2004 16:09:27 -0300 + MIME-Version: 1.0 + Content-Type: multipart/alternative; + + Blah blah blah + """) + + def test_lying_multipart(self): + msg = self._str_msg(self.lying_multipart_msg) + self.assertTrue(hasattr(msg, 'defects')) + self.assertEqual(len(self.get_defects(msg)), 2) + self.assertTrue(isinstance(self.get_defects(msg)[0], + errors.NoBoundaryInMultipartDefect)) + self.assertTrue(isinstance(self.get_defects(msg)[1], + errors.MultipartInvariantViolationDefect)) + + def test_lying_multipart_raise_on_defect(self): + with self.assertRaises(errors.NoBoundaryInMultipartDefect): + self._str_msg(self.lying_multipart_msg, + policy=self.policy.clone(raise_on_defect=True)) + + missing_start_boundary_msg = textwrap.dedent("""\ + Content-Type: multipart/mixed; boundary="AAA" + From: Mail Delivery 
Subsystem + To: yyy@example.com + + --AAA + + Stuff + + --AAA + Content-Type: message/rfc822 + + From: webmaster@python.org + To: zzz@example.com + Content-Type: multipart/mixed; boundary="BBB" + + --BBB-- + + --AAA-- + + """) + + def test_missing_start_boundary(self): + # The message structure is: + # + # multipart/mixed + # text/plain + # message/rfc822 + # multipart/mixed [*] + # + # [*] This message is missing its start boundary + outer = self._str_msg(self.missing_start_boundary_msg) + bad = outer.get_payload(1).get_payload(0) + self.assertEqual(len(self.get_defects(bad)), 1) + self.assertTrue(isinstance(self.get_defects(bad)[0], + errors.StartBoundaryNotFoundDefect)) + + def test_missing_start_boundary_raise_on_defect(self): + with self.assertRaises(errors.StartBoundaryNotFoundDefect): + self._str_msg(self.missing_start_boundary_msg, + policy=self.policy.clone(raise_on_defect=True)) + + def test_first_line_is_continuation_header(self): + msg = self._str_msg(' Line 1\nLine 2\nLine 3') + self.assertEqual(msg.keys(), []) + self.assertEqual(msg.get_payload(), 'Line 2\nLine 3') + self.assertEqual(len(self.get_defects(msg)), 1) + self.assertTrue(isinstance(self.get_defects(msg)[0], + errors.FirstHeaderLineIsContinuationDefect)) + self.assertEqual(self.get_defects(msg)[0].line, ' Line 1\n') + + def test_first_line_is_continuation_header_raise_on_defect(self): + with self.assertRaises(errors.FirstHeaderLineIsContinuationDefect): + self._str_msg(' Line 1\nLine 2\nLine 3', + policy=self.policy.clone(raise_on_defect=True)) + + +class TestMessageDefectDetection(TestMessageDefectDetectionBase, TestEmailBase): + + def get_defects(self, obj): + return obj.defects + + +class TestMessageDefectDetectionCapture(TestMessageDefectDetectionBase, + TestEmailBase): + + class CapturePolicy(Compat32): + captured = None + def register_defect(self, obj, defect): + self.captured.append(defect) + + def setUp(self): + self.policy = self.CapturePolicy(captured=list()) + + def 
get_defects(self, obj): + return self.policy.captured + + +if __name__ == '__main__': + unittest.main() diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py index 1c65901..07925a7 100644 --- a/Lib/test/test_email/test_policy.py +++ b/Lib/test/test_email/test_policy.py @@ -1,6 +1,10 @@ +import io import types +import textwrap import unittest import email.policy +import email.parser +import email.generator class PolicyAPITests(unittest.TestCase): @@ -11,14 +15,15 @@ class PolicyAPITests(unittest.TestCase): policy_defaults = { 'max_line_length': 78, 'linesep': '\n', - 'must_be_7bit': False, + 'cte_type': '8bit', 'raise_on_defect': False, } # For each policy under test, we give here the values of the attributes # that are different from the defaults for that policy. policies = { - email.policy.Policy(): {}, + email.policy.Compat32(): {}, + email.policy.compat32: {}, email.policy.default: {}, email.policy.SMTP: {'linesep': '\r\n'}, email.policy.HTTP: {'linesep': '\r\n', 'max_line_length': None}, @@ -44,6 +49,18 @@ class PolicyAPITests(unittest.TestCase): self.assertIn(attr, self.policy_defaults, "{} is not fully tested".format(attr)) + def test_abc(self): + with self.assertRaises(TypeError) as cm: + email.policy.Policy() + msg = str(cm.exception) + abstract_methods = ('fold', + 'fold_binary', + 'header_fetch_parse', + 'header_source_parse', + 'header_store_parse') + for method in abstract_methods: + self.assertIn(method, msg) + def test_policy_is_immutable(self): for policy in self.policies: for attr in self.policy_defaults: @@ -88,7 +105,7 @@ class PolicyAPITests(unittest.TestCase): self.defects = [] obj = Dummy() defect = object() - policy = email.policy.Policy() + policy = email.policy.Compat32() policy.register_defect(obj, defect) self.assertEqual(obj.defects, [defect]) defect2 = object() @@ -117,7 +134,7 @@ class PolicyAPITests(unittest.TestCase): email.policy.default.handle_defect(foo, defect2) self.assertEqual(foo.defects, 
[defect1, defect2]) - class MyPolicy(email.policy.Policy): + class MyPolicy(email.policy.Compat32): defects = None def __init__(self, *args, **kw): super().__init__(*args, defects=[], **kw) @@ -146,5 +163,94 @@ class PolicyAPITests(unittest.TestCase): # For adding subclassed objects, make sure the usual rules apply (subclass # wins), but that the order still works (right overrides left). + +class TestPolicyPropagation(unittest.TestCase): + + # The abstract methods are used by the parser but not by the wrapper + # functions that call it, so if the exception gets raised we know that the + # policy was actually propagated all the way to feedparser. + class MyPolicy(email.policy.Policy): + def badmethod(self, *args, **kw): + raise Exception("test") + fold = fold_binary = header_fetch_parser = badmethod + header_source_parse = header_store_parse = badmethod + + def test_message_from_string(self): + with self.assertRaisesRegex(Exception, "^test$"): + email.message_from_string("Subject: test\n\n", + policy=self.MyPolicy) + + def test_message_from_bytes(self): + with self.assertRaisesRegex(Exception, "^test$"): + email.message_from_bytes(b"Subject: test\n\n", + policy=self.MyPolicy) + + def test_message_from_file(self): + f = io.StringIO('Subject: test\n\n') + with self.assertRaisesRegex(Exception, "^test$"): + email.message_from_file(f, policy=self.MyPolicy) + + def test_message_from_binary_file(self): + f = io.BytesIO(b'Subject: test\n\n') + with self.assertRaisesRegex(Exception, "^test$"): + email.message_from_binary_file(f, policy=self.MyPolicy) + + # These are redundant, but we need them for black-box completeness. 
+ + def test_parser(self): + p = email.parser.Parser(policy=self.MyPolicy) + with self.assertRaisesRegex(Exception, "^test$"): + p.parsestr('Subject: test\n\n') + + def test_bytes_parser(self): + p = email.parser.BytesParser(policy=self.MyPolicy) + with self.assertRaisesRegex(Exception, "^test$"): + p.parsebytes(b'Subject: test\n\n') + + # Now that we've established that all the parse methods get the + # policy in to feedparser, we can use message_from_string for + # the rest of the propagation tests. + + def _make_msg(self, source='Subject: test\n\n', policy=None): + self.policy = email.policy.default.clone() if policy is None else policy + return email.message_from_string(source, policy=self.policy) + + def test_parser_propagates_policy_to_message(self): + msg = self._make_msg() + self.assertIs(msg.policy, self.policy) + + def test_parser_propagates_policy_to_sub_messages(self): + msg = self._make_msg(textwrap.dedent("""\ + Subject: mime test + MIME-Version: 1.0 + Content-Type: multipart/mixed, boundary="XXX" + + --XXX + Content-Type: text/plain + + test + --XXX + Content-Type: text/plain + + test2 + --XXX-- + """)) + for part in msg.walk(): + self.assertIs(part.policy, self.policy) + + def test_message_policy_propagates_to_generator(self): + msg = self._make_msg("Subject: test\nTo: foo\n\n", + policy=email.policy.default.clone(linesep='X')) + s = io.StringIO() + g = email.generator.Generator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), "Subject: testXTo: fooXX") + + def test_message_policy_used_by_as_string(self): + msg = self._make_msg("Subject: test\nTo: foo\n\n", + policy=email.policy.default.clone(linesep='X')) + self.assertEqual(msg.as_string(), "Subject: testXTo: fooXX") + + if __name__ == '__main__': unittest.main() -- cgit v0.12