#11731: simplify/enhance parser/generator API by introducing policy objects.

This new interface will also allow for future planned enhancements in control over the parser/generator without requiring any additional complexity in the parser/generator API. Patch reviewed by Éric Araujo and Barry Warsaw.
author: R David Murray <rdmurray@bitdance.com> 2011-04-18 17:59:37 (GMT)
committer: R David Murray <rdmurray@bitdance.com> 2011-04-18 17:59:37 (GMT)
commit: 3edd22ac950d3a2bcc1ad2e5a83554970aef3369 (patch)
tree: b4661afc1be45e0d072c1c83ab354b2362f05afb /Lib/email
parent: ce16be91dc68597b0c5bfc7b4b1c5136fe5697a6 (diff)
download: cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.zip
cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.tar.gz
cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.tar.bz2
5 files changed, 238 insertions, 34 deletions
diff --git a/Lib/email/errors.py b/Lib/email/errors.py
index d52a624..c2ea7d4 100644
--- a/Lib/email/errors.py
+++ b/Lib/email/errors.py
@@ -32,7 +32,7 @@ class CharsetError(MessageError):
 
 
 # These are parsing defects which the parser was able to work around.
-class MessageDefect:
+class MessageDefect(Exception):
     """Base class for a message defect."""
 
     def __init__(self, line=None):
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 1b752d0..60de49e 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -25,6 +25,7 @@ import re
 
 from email import errors
 from email import message
+from email import policy
 
 NLCRE = re.compile('\r\n|\r|\n')
 NLCRE_bol = re.compile('(\r\n|\r|\n)')
@@ -134,9 +135,16 @@ class BufferedSubFile(object):
 class FeedParser:
     """A feed-style parser of email."""
 
-    def __init__(self, _factory=message.Message):
-        """_factory is called with no arguments to create a new message obj"""
+    def __init__(self, _factory=message.Message, *, policy=policy.default):
+        """_factory is called with no arguments to create a new message obj
+
+        The policy keyword specifies a policy object that controls a number of
+        aspects of the parser's operation.  The default policy maintains
+        backward compatibility.
+
+        """
         self._factory = _factory
+        self.policy = policy
         self._input = BufferedSubFile()
         self._msgstack = []
         self._parse = self._parsegen().__next__
@@ -168,7 +176,8 @@ class FeedParser:
         # Look for final set of defects
         if root.get_content_maintype() == 'multipart' \
                and not root.is_multipart():
-            root.defects.append(errors.MultipartInvariantViolationDefect())
+            defect = errors.MultipartInvariantViolationDefect()
+            self.policy.handle_defect(root, defect)
         return root
 
     def _new_message(self):
@@ -281,7 +290,8 @@ class FeedParser:
                 # defined a boundary.  That's a problem which we'll handle by
                 # reading everything until the EOF and marking the message as
                 # defective.
-                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
+                defect = errors.NoBoundaryInMultipartDefect()
+                self.policy.handle_defect(self._cur, defect)
                 lines = []
                 for line in self._input:
                     if line is NeedMoreData:
@@ -385,7 +395,8 @@ class FeedParser:
             # that as a defect and store the captured text as the payload.
             # Everything from here to the EOF is epilogue.
             if capturing_preamble:
-                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
+                defect = errors.StartBoundaryNotFoundDefect()
+                self.policy.handle_defect(self._cur, defect)
                 self._cur.set_payload(EMPTYSTRING.join(preamble))
                 epilogue = []
                 for line in self._input:
@@ -437,7 +448,7 @@ class FeedParser:
                     # is illegal, so let's note the defect, store the illegal
                     # line, and ignore it for purposes of headers.
                     defect = errors.FirstHeaderLineIsContinuationDefect(line)
-                    self._cur.defects.append(defect)
+                    self.policy.handle_defect(self._cur, defect)
                     continue
                 lastvalue.append(line)
                 continue
diff --git a/Lib/email/generator.py b/Lib/email/generator.py
index fdd34e4..d8b8fa9 100644
--- a/Lib/email/generator.py
+++ b/Lib/email/generator.py
@@ -13,8 +13,10 @@ import random
 import warnings
 
 from io import StringIO, BytesIO
+from email import policy
 from email.header import Header
 from email.message import _has_surrogates
+import email.charset as _charset
 
 UNDERSCORE = '_'
 NL = '\n'  # XXX: no longer used by the code below.
@@ -33,7 +35,8 @@ class Generator:
     # Public interface
     #
 
-    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78):
+    def __init__(self, outfp, mangle_from_=True, maxheaderlen=None, *,
+                 policy=policy.default):
         """Create the generator for message flattening.
 
         outfp is the output file-like object for writing the message to.  It
@@ -49,16 +52,23 @@ class Generator:
         defined in the Header class.  Set maxheaderlen to zero to disable
         header wrapping.  The default is 78, as recommended (but not required)
         by RFC 2822.
+
+        The policy keyword specifies a policy object that controls a number of
+        aspects of the generator's operation.  The default policy maintains
+        backward compatibility.
+
         """
         self._fp = outfp
         self._mangle_from_ = mangle_from_
-        self._maxheaderlen = maxheaderlen
+        self._maxheaderlen = (maxheaderlen if maxheaderlen is not None else
+                                 policy.max_line_length)
+        self.policy = policy
 
     def write(self, s):
         # Just delegate to the file object
         self._fp.write(s)
 
-    def flatten(self, msg, unixfrom=False, linesep='\n'):
+    def flatten(self, msg, unixfrom=False, linesep=None):
         r"""Print the message object tree rooted at msg to the output file
         specified when the Generator instance was created.
 
@@ -70,17 +80,15 @@ class Generator:
         Note that for subobjects, no From_ line is printed.
 
         linesep specifies the characters used to indicate a new line in
-        the output.  The default value is the most useful for typical
-        Python applications, but it can be set to \r\n to produce RFC-compliant
-        line separators when needed.
+        the output.  The default value is determined by the policy.
 
         """
         # We use the _XXX constants for operating on data that comes directly
         # from the msg, and _encoded_XXX constants for operating on data that
         # has already been converted (to bytes in the BytesGenerator) and
         # inserted into a temporary buffer.
-        self._NL = linesep
-        self._encoded_NL = self._encode(linesep)
+        self._NL = linesep if linesep is not None else self.policy.linesep
+        self._encoded_NL = self._encode(self._NL)
         self._EMPTY = ''
         self._encoded_EMTPY = self._encode('')
         if unixfrom:
@@ -338,7 +346,10 @@ class BytesGenerator(Generator):
 
     Functionally identical to the base Generator except that the output is
     bytes and not string.  When surrogates were used in the input to encode
-    bytes, these are decoded back to bytes for output.
+    bytes, these are decoded back to bytes for output.  If the policy has
+    must_be_7bit set true, then the message is transformed such that the
+    non-ASCII bytes are properly content transfer encoded, using the
+    charset unknown-8bit.
 
     The outfp object must accept bytes in its write method.
     """
@@ -361,21 +372,22 @@ class BytesGenerator(Generator):
         # strings with 8bit bytes.
         for h, v in msg._headers:
             self.write('%s: ' % h)
-            if isinstance(v, Header):
-                self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
-            elif _has_surrogates(v):
-                # If we have raw 8bit data in a byte string, we have no idea
-                # what the encoding is.  There is no safe way to split this
-                # string.  If it's ascii-subset, then we could do a normal
-                # ascii split, but if it's multibyte then we could break the
-                # string.  There's no way to know so the least harm seems to
-                # be to not split the string and risk it being too long.
-                self.write(v+NL)
-            else:
-                # Header's got lots of smarts and this string is safe...
-                header = Header(v, maxlinelen=self._maxheaderlen,
-                                header_name=h)
-                self.write(header.encode(linesep=self._NL)+self._NL)
+            if isinstance(v, str):
+                if _has_surrogates(v):
+                    if not self.policy.must_be_7bit:
+                        # If we have raw 8bit data in a byte string, we have no idea
+                        # what the encoding is.  There is no safe way to split this
+                        # string.  If it's ascii-subset, then we could do a normal
+                        # ascii split, but if it's multibyte then we could break the
+                        # string.  There's no way to know so the least harm seems to
+                        # be to not split the string and risk it being too long.
+                        self.write(v+NL)
+                        continue
+                    h = Header(v, charset=_charset.UNKNOWN8BIT, header_name=h)
+                else:
+                    h = Header(v, header_name=h)
+            self.write(h.encode(linesep=self._NL,
+                                maxlinelen=self._maxheaderlen)+self._NL)
         # A blank line always separates headers from body
         self.write(self._NL)
 
@@ -384,7 +396,7 @@ class BytesGenerator(Generator):
         # just write it back out.
         if msg._payload is None:
             return
-        if _has_surrogates(msg._payload):
+        if _has_surrogates(msg._payload) and not self.policy.must_be_7bit:
             self.write(msg._payload)
         else:
             super(BytesGenerator,self)._handle_text(msg)
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index fc5090b..0f92160 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -11,11 +11,12 @@ from io import StringIO, TextIOWrapper
 
 from email.feedparser import FeedParser
 from email.message import Message
+from email import policy
 
 
 
 class Parser:
-    def __init__(self, _class=Message):
+    def __init__(self, _class=Message, *, policy=policy.default):
         """Parser of RFC 2822 and MIME email messages.
 
         Creates an in-memory object tree representing the email message, which
@@ -30,8 +31,14 @@ class Parser:
         _class is the class to instantiate for new message objects when they
         must be created.  This class must have a constructor that can take
         zero arguments.  Default is Message.Message.
+
+        The policy keyword specifies a policy object that controls a number of
+        aspects of the parser's operation.  The default policy maintains
+        backward compatibility.
+
         """
         self._class = _class
+        self.policy = policy
 
     def parse(self, fp, headersonly=False):
         """Create a message structure from the data in a file.
@@ -41,7 +48,7 @@ class Parser:
         parsing after reading the headers or not.  The default is False,
         meaning it parses the entire contents of the file.
         """
-        feedparser = FeedParser(self._class)
+        feedparser = FeedParser(self._class, policy=self.policy)
         if headersonly:
             feedparser._set_headersonly()
         while True:
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
new file mode 100644
index 0000000..88877a2
--- /dev/null
+++ b/Lib/email/policy.py
@@ -0,0 +1,174 @@
+"""Policy framework for the email package.
+
+Allows fine grained feature control of how the package parses and emits data.
+"""
+
+__all__ = [
+    'Policy',
+    'default',
+    'strict',
+    'SMTP',
+    'HTTP',
+    ]
+
+
+class _PolicyBase:
+
+    """Policy Object basic framework.
+
+    This class is useless unless subclassed.  A subclass should define
+    class attributes with defaults for any values that are to be
+    managed by the Policy object.  The constructor will then allow
+    non-default values to be set for these attributes at instance
+    creation time.  The instance will be callable, taking these same
+    attributes keyword arguments, and returning a new instance
+    identical to the called instance except for those values changed
+    by the keyword arguments.  Instances may be added, yielding new
+    instances with any non-default values from the right hand
+    operand overriding those in the left hand operand.  That is,
+
+        A + B == A(<non-default values of B>)
+
+    The repr of an instance can be used to reconstruct the object
+    if and only if the repr of the values can be used to reconstruct
+    those values.
+
+    """
+
+    def __init__(self, **kw):
+        """Create new Policy, possibly overriding some defaults.
+
+        See class docstring for a list of overridable attributes.
+
+        """
+        for name, value in kw.items():
+            if hasattr(self, name):
+                super(_PolicyBase,self).__setattr__(name, value)
+            else:
+                raise TypeError(
+                    "{!r} is an invalid keyword argument for {}".format(
+                        name, self.__class__.__name__))
+
+    def __repr__(self):
+        args = [ "{}={!r}".format(name, value)
+                 for name, value in self.__dict__.items() ]
+        return "{}({})".format(self.__class__.__name__, args if args else '')
+
+    def clone(self, **kw):
+        """Return a new instance with specified attributes changed.
+
+        The new instance has the same attribute values as the current object,
+        except for the changes passed in as keyword arguments.
+
+        """
+        for attr, value in self.__dict__.items():
+            if attr not in kw:
+                kw[attr] = value
+        return self.__class__(**kw)
+
+    def __setattr__(self, name, value):
+        if hasattr(self, name):
+            msg = "{!r} object attribute {!r} is read-only"
+        else:
+            msg = "{!r} object has no attribute {!r}"
+        raise AttributeError(msg.format(self.__class__.__name__, name))
+
+    def __add__(self, other):
+        """Non-default values from right operand override those from left.
+
+        The object returned is a new instance of the subclass.
+
+        """
+        return self.clone(**other.__dict__)
+
+
+class Policy(_PolicyBase):
+
+    """Controls for how messages are interpreted and formatted.
+
+    Most of the classes and many of the methods in the email package
+    accept Policy objects as parameters.  A Policy object contains a set
+    of values and functions that control how input is interpreted and how
+    output is rendered.  For example, the parameter 'raise_on_defect'
+    controls whether or not an RFC violation throws an error or not,
+    while 'max_line_length' controls the maximum length of output lines
+    when a Message is serialized.
+
+    Any valid attribute may be overridden when a Policy is created by
+    passing it as a keyword argument to the constructor.  Policy
+    objects are immutable, but a new Policy object can be created
+    with only certain values changed by calling the Policy instance
+    with keyword arguments.  Policy objects can also be added,
+    producing a new Policy object in which the non-default attributes
+    set in the right hand operand overwrite those specified in the
+    left operand.
+
+    Settable attributes:
+
+    raise_on_defect     -- If true, then defects should be raised
+                           as errors.  Default False.
+
+    linesep             -- string containing the value to use as
+                           separation between output lines.  Default '\n'.
+
+    must_be_7bit        -- output must contain only 7bit clean data.
+                           Default False.
+
+    max_line_length     -- maximum length of lines, excluding 'linesep',
+                           during serialization.  None means no line
+                           wrapping is done.  Default is 78.
+
+    Methods:
+
+    register_defect(obj, defect)
+        defect is a Defect instance.  The default implementation appends defect
+        to the objs 'defects' attribute.
+
+    handle_defect(obj, defect)
+        intended to be called by parser code that finds a defect.  If
+        raise_on_defect is True, defect is raised as an error, otherwise
+        register_defect is called.
+
+    """
+
+    raise_on_defect = False
+    linesep = '\n'
+    must_be_7bit = False
+    max_line_length = 78
+
+    def handle_defect(self, obj, defect):
+        """Based on policy, either raise defect or call register_defect.
+
+            handle_defect(obj, defect)
+
+        defect should be a Defect subclass, but in any case must be an
+        Exception subclass.  obj is the object on which the defect should be
+        registered if it is not raised.  If the raise_on_defect is True, the
+        defect is raised as an error, otherwise the object and the defect are
+        passed to register_defect.
+
+        This class is intended to be called by parsers that discover defects,
+        and will not be called from code using the library unless that code is
+        implementing an alternate parser.
+
+        """
+        if self.raise_on_defect:
+            raise defect
+        self.register_defect(obj, defect)
+
+    def register_defect(self, obj, defect):
+        """Record 'defect' on 'obj'.
+
+        Called by handle_defect if raise_on_defect is False.  This method is
+        part of the Policy API so that Policy subclasses can implement custom
+        defect handling.  The default implementation calls the append method
+        of the defects attribute of obj.
+
+        """
+        obj.defects.append(defect)
+
+
+default = Policy()
+strict = default.clone(raise_on_defect=True)
+SMTP = default.clone(linesep='\r\n')
+HTTP = default.clone(linesep='\r\n', max_line_length=None)
author	R David Murray <rdmurray@bitdance.com>	2011-04-18 17:59:37 (GMT)
committer	R David Murray <rdmurray@bitdance.com>	2011-04-18 17:59:37 (GMT)
commit	3edd22ac950d3a2bcc1ad2e5a83554970aef3369 (patch)
tree	b4661afc1be45e0d072c1c83ab354b2362f05afb /Lib/email
parent	ce16be91dc68597b0c5bfc7b4b1c5136fe5697a6 (diff)
download	cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.zip cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.tar.gz cpython-3edd22ac950d3a2bcc1ad2e5a83554970aef3369.tar.bz2