diff options
Diffstat (limited to 'Lib/email/Parser.py')
| -rw-r--r-- | Lib/email/Parser.py | 154 | 
1 files changed, 154 insertions, 0 deletions
# diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py new file mode 100644 index 0000000..cc23d19 --- /dev/null +++ b/Lib/email/Parser.py @@ -0,0 +1,154 @@
# Copyright (C) 2001 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""

import re
from cStringIO import StringIO

# Intrapackage imports
import Errors
import Message

# Extracts the boundary parameter from a Content-Type header value.  The
# capture stops at a closing quote or at a semicolon, so an unquoted boundary
# followed by further parameters ("boundary=abc; charset=x") does not swallow
# those parameters.  A semicolon is not a legal boundary character.
bcre = re.compile('boundary="?([^";]+)"?', re.IGNORECASE)
EMPTYSTRING = ''
NL = '\n'



class Parser:
    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from the file-like object fp and return its root.

        A new _class instance is created, its headers are read from fp line
        by line, and the remainder of fp is consumed as the body.  Raises
        Errors.HeaderParseError or Errors.BoundaryError on malformed input.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text; see parse()."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store each header on container.

        Reading stops at the first blank (or whitespace-only) line or at end
        of input.  A leading `From ' line is recorded via
        container.set_unixfrom(); continuation lines are joined to the
        preceding header's value with newlines.

        Raises Errors.HeaderParseError for a Unix-from line that is not the
        first line, a continuation line before any header, or a line that is
        neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            line = fp.readline()
            # Strip only the line terminator.  The final line of the input
            # may lack one, and blindly chopping the last character would
            # then truncate the header value.
            if line.endswith(NL):
                line = line[:-1]
            if not line.strip():
                # End of input or the blank line that ends the header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno != 1:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                container.set_unixfrom(line)
                continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header begins, so the previous one is now complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Consume the rest of fp as the body of container.

        If the Content-Type header carries a boundary parameter, the body is
        split on that boundary and every subpart is parsed recursively and
        added as a payload; any text before the first boundary or after the
        terminating boundary is stored as container.preamble and
        container.epilogue (None when absent).  A message/rfc822 body is
        parsed as a nested message.  Anything else is added as one string.

        Raises Errors.BoundaryError when the starting or terminating boundary
        cannot be found.
        """
        boundary = isdigest = mimetype = None
        ctype = container['content-type']
        if ctype:
            mo = bcre.search(ctype)
            if mo:
                # Whitespace before a following `;' is not part of an
                # unquoted boundary.
                boundary = mo.group(1).rstrip()
            mimetype = container.get_type()
            isdigest = mimetype == 'multipart/digest'
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Skip the boundary line itself, including its newline
            start += len(separator) + 1
            if isdigest:
                # Digest subparts carry one extra newline before their
                # headers (see below); skip it for the first part too.
                start += 1
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # Offset just past '\n' + separator + '--'
            tail = terminator + len(separator) + 3
            if tail < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[tail:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            container.preamble = preamble
            container.epilogue = epilogue
            for part in payload[start:terminator].split('\n' + separator):
                container.add_payload(self.parsestr(part))
        elif mimetype == 'message/rfc822':
            # Compare the type reported by get_type() rather than the raw
            # header, so that parameters on the Content-Type line do not
            # defeat the match.
            #
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.add_payload(msg)
        else:
            container.add_payload(fp.read())
