diff options
Diffstat (limited to 'Lib/email/Parser.py')
-rw-r--r-- | Lib/email/Parser.py | 154 |
1 files changed, 154 insertions, 0 deletions
# Copyright (C) 2001 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""

import re
from cStringIO import StringIO

# Intrapackage imports
import Errors
import Message

# Extracts the (optionally quoted) boundary parameter from a Content-Type
# header value.
bcre = re.compile('boundary="?([^"]+)"?', re.IGNORECASE)
EMPTYSTRING = ''
NL = '\n'



class Parser:
    """Build a tree of message objects from RFC 2822 / MIME text."""

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from the file-like object fp.

        fp must support readline() and read().  A new instance of the
        configured _class is created, filled in from the headers and the body
        read from fp, and returned.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message from the string text; see parse()."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store it on container.

        Lines are consumed up to and including the first blank (or
        whitespace-only) line, or up to end of file.  Each header is stored
        with container[name] = value; continuation lines are joined to the
        value of the preceding header with newlines.  An initial Unix-From
        line is recorded with container.set_unixfrom().  Nothing is returned.

        Raises Errors.HeaderParseError for a Unix-From line that is not the
        first line, a continuation line before any header, or a line that is
        neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            line = fp.readline()
            # Strip only a real line terminator.  Unconditionally chopping the
            # last character would eat data when the final header line of the
            # input is not newline-terminated.
            if line.endswith('\n'):
                line = line[:-1]
            # End of file, or the blank line separating headers from body.
            if not line or not line.strip():
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            #
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Read the rest of fp and attach it to container as payload.

        If the Content-Type header carries a boundary parameter, the body is
        split on that boundary, each part is parsed recursively and added with
        container.add_payload(), and any text before the first boundary or
        after the terminating boundary is stored as container.preamble and
        container.epilogue (None when absent).  A message/rfc822 body is
        parsed as a nested message.  Anything else is added as one string.

        Raises Errors.BoundaryError when the starting or terminating boundary
        cannot be found in the body.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = None
        isdigest = 0
        ctype = container['content-type']
        if ctype:
            mo = bcre.search(ctype)
            if mo:
                boundary = mo.group(1)
                isdigest = container.get_type() == 'multipart/digest'
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Skip the boundary line and its newline; digest subparts have one
            # more newline before their headers.
            start += len(separator) + 1
            if isdigest:
                start += 1
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # The terminating marker is '\n' + separator + '--', hence the +3.
            if terminator+len(separator)+3 < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[terminator+len(separator)+3:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            parts = payload[start:terminator].split('\n' + separator)
            # These do not depend on the individual part, so set them once.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                msgobj = self.parsestr(part)
                container.add_payload(msgobj)
        elif ctype and container.get_type() == 'message/rfc822':
            # Compare get_type() rather than the raw header text so that
            # parameters on the header do not defeat the match.  This assumes
            # get_type() yields the bare type/subtype, which the
            # multipart/digest test above already relies on.
            #
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                # NOTE(review): fp has already been partly consumed by the
                # failed parse, so only what remains ends up in this payload.
                msg = self._class()
                self._parsebody(msg, fp)
            container.add_payload(msg)
        else:
            container.add_payload(fp.read())