From 0813d76cb09f4672bdf893fb4a6eef49a00593db Mon Sep 17 00:00:00 2001
From: Thomas Wouters <thomas@python.org>
Date: Sat, 20 Mar 2004 17:31:29 +0000
Subject: Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogusly formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
---
 Lib/email/Parser.py | 301 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 173 insertions(+), 128 deletions(-)

diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
index 09fac45..3fe1990 100644
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -22,6 +22,75 @@ except NameError:
 
 NLCRE = re.compile('\r\n|\r|\n')
 
+class TextUtil:
+    """ A utility class for wrapping a file object and providing a 
+        couple of additional useful functions.
+    """
+
+    def __init__(self, fp):
+        self.fp = fp
+        self.unread = []
+
+    def readline(self):
+        """ Return a line of data.
+
+        If data has been pushed back with unreadline(), the most recently
+        pushed-back line is returned instead of reading from the file.
+        """
+        if self.unread:
+            return self.unread.pop()
+        else:
+            return self.fp.readline()
+
+    def unreadline(self, line):
+        """Push a line back into the object. 
+        """
+        self.unread.append(line)
+
+    def peekline(self):
+        """Non-destructively look at the next line"""
+        line = self.readline()
+        self.unreadline(line)
+        return line
+
+    def read(self):
+        """Return the remaining data
+        """
+        r = self.fp.read()
+        if self.unread:
+            r = "\n".join(self.unread) + r
+            self.unread = []
+        return r
+
+    def readuntil(self, re, afterblank=0, includematch=0):
+        """Read a line at a time until we get the specified RE. 
+
+        Returns the text up to (and including, if includematch is true) the 
+        matched text, and the RE match object. If afterblank is true, 
+        there must be a blank line before the matched text. Moves current 
+        filepointer to the line following the matched line. If we reach 
+        end-of-file, return what we've got so far, and return None as the
+        RE match object.
+        """
+        prematch = []
+        blankseen = 0
+        while 1:
+            line = self.readline()
+            if not line:
+                # end of file
+                return EMPTYSTRING.join(prematch), None
+            if afterblank:
+                if NLCRE.match(line):
+                    blankseen = 1
+                    continue
+                else:
+                    blankseen = 0
+            m = re.match(line)
+            if (m and not afterblank) or (m and afterblank and blankseen):
+                if includematch:
+                    prematch.append(line)
+                return EMPTYSTRING.join(prematch), m
+            prematch.append(line)
 
 
 class Parser:
@@ -59,9 +128,13 @@ class Parser:
         meaning it parses the entire contents of the file.
         """
         root = self._class()
-        firstbodyline = self._parseheaders(root, fp)
+        fp = TextUtil(fp)
+        self._parseheaders(root, fp)
         if not headersonly:
-            self._parsebody(root, fp, firstbodyline)
+            obj = self._parsemessage(root, fp)
+            trailer = fp.read()
+            if obj and trailer:
+                self._attach_trailer(obj, trailer)
         return root
 
     def parsestr(self, text, headersonly=False):
@@ -80,7 +153,6 @@ class Parser:
         lastheader = ''
         lastvalue = []
         lineno = 0
-        firstbodyline = None
         while True:
             # Don't strip the line before we test for the end condition,
             # because whitespace-only header lines are RFC compliant
@@ -129,7 +201,7 @@ class Parser:
                     # There was no separating blank line as mandated by RFC
                     # 2822, but we're in non-strict mode.  So just offer up
                     # this current line as the first body line.
-                    firstbodyline = line
+                    fp.unreadline(line)
                     break
             if lastheader:
                 container[lastheader] = NL.join(lastvalue)
@@ -138,140 +210,114 @@ class Parser:
         # Make sure we retain the last header
         if lastheader:
             container[lastheader] = NL.join(lastvalue)
-        return firstbodyline
+        return 
 
-    def _parsebody(self, container, fp, firstbodyline=None):
-        # Parse the body, but first split the payload on the content-type
-        # boundary if present.
+    def _parsemessage(self, container, fp):
+        # Parse the body. We walk through the body from top to bottom,
+        # keeping track of the current multipart nesting as we go.
+        # We return the object that gets the data at the end of this 
+        # block.
         boundary = container.get_boundary()
         isdigest = (container.get_content_type() == 'multipart/digest')
-        # If there's a boundary, split the payload text into its constituent
-        # parts and parse each separately.  Otherwise, just parse the rest of
-        # the body as a single message.  Note: any exceptions raised in the
-        # recursive parse need to have their line numbers coerced.
-        if boundary:
-            preamble = epilogue = None
-            # Split into subparts.  The first boundary we're looking for won't
-            # always have a leading newline since we're at the start of the
-            # body text, and there's not always a preamble before the first
-            # boundary.
+        if boundary: 
             separator = '--' + boundary
-            payload = fp.read()
-            if firstbodyline is not None:
-                payload = firstbodyline + '\n' + payload
-            # We use an RE here because boundaries can have trailing
-            # whitespace.
-            mo = re.search(
-                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
-                payload)
-            if not mo:
-                if self._strict:
-                    raise Errors.BoundaryError(
-                        "Couldn't find starting boundary: %s" % boundary)
-                container.set_payload(payload)
-                return
-            start = mo.start()
-            if start > 0:
-                # there's some pre-MIME boundary preamble
-                preamble = payload[0:start]
-            # Find out what kind of line endings we're using
-            start += len(mo.group('sep')) + len(mo.group('ws'))
-            mo = NLCRE.search(payload, start)
-            if mo:
-                start += len(mo.group(0))
-            # We create a compiled regexp first because we need to be able to
-            # specify the start position, and the module function doesn't
-            # support this signature. :(
-            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
-                             re.escape(separator) + '--')
-            mo = cre.search(payload, start)
-            if mo:
-                terminator = mo.start()
-                linesep = mo.group('sep')
-                if mo.end() < len(payload):
-                    # There's some post-MIME boundary epilogue
-                    epilogue = payload[mo.end():]
-            elif self._strict:
-                raise Errors.BoundaryError(
-                        "Couldn't find terminating boundary: %s" % boundary)
+            boundaryRE = re.compile(
+                    r'(?P<sep>' + re.escape(separator) + 
+                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
+            preamble, matchobj = fp.readuntil(boundaryRE)
+            if not matchobj:
+                # Broken - we hit the end of file. Just set the body 
+                # to the text.
+                container.set_payload(preamble)
+                return container
+            if preamble:
+                container.preamble = preamble
             else:
-                # Handle the case of no trailing boundary.  Check that it ends
-                # in a blank line.  Some cases (spamspamspam) don't even have
-                # that!
-                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
-                if not mo:
-                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
-                    if not mo:
-                        raise Errors.BoundaryError(
-                          'No terminating boundary and no trailing empty line')
-                linesep = mo.group('sep')
-                terminator = len(payload)
-            # We split the textual payload on the boundary separator, which
-            # includes the trailing newline. If the container is a
-            # multipart/digest then the subparts are by default message/rfc822
-            # instead of text/plain.  In that case, they'll have a optional
-            # block of MIME headers, then an empty line followed by the
-            # message headers.
-            parts = re.split(
-                linesep + re.escape(separator) + r'[ \t]*' + linesep,
-                payload[start:terminator])
-            for part in parts:
+                # The module docs specify an empty preamble is None, not ''
+                container.preamble = None
+            while 1:
+                subobj = self._class()
                 if isdigest:
-                    if part.startswith(linesep):
-                        # There's no header block so create an empty message
-                        # object as the container, and lop off the newline so
-                        # we can parse the sub-subobject
-                        msgobj = self._class()
-                        part = part[len(linesep):]
+                    subobj.set_default_type('message/rfc822')
+                    firstline = fp.peekline()
+                    if firstline.strip():
+                        # we have MIME headers. all good. 
+                        self._parseheaders(subobj, fp)
                     else:
-                        parthdrs, part = part.split(linesep+linesep, 1)
-                        # msgobj in this case is the "message/rfc822" container
-                        msgobj = self.parsestr(parthdrs, headersonly=1)
-                    # while submsgobj is the message itself
-                    msgobj.set_default_type('message/rfc822')
-                    maintype = msgobj.get_content_maintype()
-                    if maintype in ('message', 'multipart'):
-                        submsgobj = self.parsestr(part)
-                        msgobj.attach(submsgobj)
-                    else:
-                        msgobj.set_payload(part)
+                        # no MIME headers. this is allowed for multipart/digest
+                        # Consume the extra blank line
+                        fp.readline()
+                        pass
                 else:
-                    msgobj = self.parsestr(part)
-                container.preamble = preamble
-                container.epilogue = epilogue
-                container.attach(msgobj)
-        elif container.get_main_type() == 'multipart':
+                    self._parseheaders(subobj, fp)
+                container.attach(subobj)
+                maintype = subobj.get_content_maintype()
+                hassubparts = (subobj.get_content_maintype() in 
+                                                ( "message", "multipart" ))
+                if hassubparts:
+                    subobj = self._parsemessage(subobj, fp)
+
+                trailer, matchobj = fp.readuntil(boundaryRE)
+                if matchobj is None or trailer:
+                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
+                    if not mo:
+                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
+                        if not mo:
+                            raise Errors.BoundaryError(
+                          'No terminating boundary and no trailing empty line')
+                    linesep = mo.group('sep')
+                    trailer = trailer[:-len(linesep)]
+                if trailer:
+                    self._attach_trailer(subobj, trailer)
+                if matchobj is None or matchobj.group('end'):
+                    # That was the last piece of data. Let our caller attach
+                    # the epilogue to us. But before we do that, push the
+                    # line ending of the match group back into the readline
+                    # buffer, as it's part of the epilogue.
+                    if matchobj:
+                        fp.unreadline(matchobj.group('linesep'))
+                    return container
+
+        elif container.get_content_maintype() == "multipart":
             # Very bad.  A message is a multipart with no boundary!
             raise Errors.BoundaryError(
-                'multipart message with no defined boundary')
-        elif container.get_type() == 'message/delivery-status':
-            # This special kind of type contains blocks of headers separated
-            # by a blank line.  We'll represent each header block as a
-            # separate Message object
-            blocks = []
-            while True:
-                blockmsg = self._class()
-                self._parseheaders(blockmsg, fp)
-                if not len(blockmsg):
-                    # No more header blocks left
-                    break
-                blocks.append(blockmsg)
-            container.set_payload(blocks)
-        elif container.get_main_type() == 'message':
-            # Create a container for the payload, but watch out for there not
-            # being any headers left
-            try:
-                msg = self.parse(fp)
-            except Errors.HeaderParseError:
+                    'multipart message with no defined boundary')
+        elif container.get_content_maintype() == "message":
+            ct = container.get_content_type()
+            if ct == "message/rfc822":
+                submessage = self._class()
+                self._parseheaders(submessage, fp)
+                self._parsemessage(submessage, fp)
+                container.attach(submessage)
+                return submessage
+            elif ct == "message/delivery-status":
+                # This special kind of type contains blocks of headers 
+                # separated by a blank line.  We'll represent each header 
+                # block as a separate Message object
+                while 1:
+                    nextblock = self._class()
+                    self._parseheaders(nextblock, fp)
+                    container.attach(nextblock)
+                    # next peek ahead to see whether we've hit the end or not
+                    nextline = fp.peekline()
+                    if nextline[:2] == "--":
+                        break
+                return container
+            else:
+                # Other sort of message object (e.g. external-body)
                 msg = self._class()
-                self._parsebody(msg, fp)
-            container.attach(msg)
+                self._parsemessage(msg, fp)
+                container.attach(msg)
+                return msg
         else:
-            text = fp.read()
-            if firstbodyline is not None:
-                text = firstbodyline + '\n' + text
-            container.set_payload(text)
+            # single body section. We let our caller set the payload.
+            return container
 
+    def _attach_trailer(self, obj, trailer):
+        if obj.get_content_maintype() in ("message", "multipart"):
+            obj.epilogue = trailer
+        else:
+            obj.set_payload(trailer)
 
 
 class HeaderParser(Parser):
@@ -284,9 +330,8 @@ class HeaderParser(Parser):
     Parsing with this subclass can be considerably faster if all you're
     interested in is the message headers.
     """
-    def _parsebody(self, container, fp, firstbodyline=None):
+    def _parsemessage(self, container, fp):
         # Consume but do not parse, the body
         text = fp.read()
-        if firstbodyline is not None:
-            text = firstbodyline + '\n' + text
         container.set_payload(text)
+        return None
-- 
cgit v0.12