# Copyright (C) 2001 Python Software Foundation # Author: barry@zope.com (Barry Warsaw) """A parser of RFC 2822 and MIME email messages. """ import re from cStringIO import StringIO # Intrapackage imports import Errors import Message bcre = re.compile('boundary="?([^"]+)"?', re.IGNORECASE) EMPTYSTRING = '' NL = '\n' class Parser: def __init__(self, _class=Message.Message): """Parser of RFC 2822 and MIME email messages. Creates an in-memory object tree representing the email message, which can then be manipulated and turned over to a Generator to return the textual representation of the message. The string must be formatted as a block of RFC 2822 headers and header continuation lines, optionally preceeded by a `Unix-from' header. The header block is terminated either by the end of the string or by a blank line. _class is the class to instantiate for new message objects when they must be created. This class must have a constructor that can take zero arguments. Default is Message.Message. """ self._class = _class def parse(self, fp): root = self._class() self._parseheaders(root, fp) self._parsebody(root, fp) return root def parsestr(self, text): return self.parse(StringIO(text)) def _parseheaders(self, container, fp): # Parse the headers, returning a list of header/value pairs. None as # the header means the Unix-From header. lastheader = '' lastvalue = [] lineno = 0 while 1: line = fp.readline()[:-1] if not line or not line.strip(): break lineno += 1 # Check for initial Unix From_ line if line.startswith('From '): if lineno == 1: container.set_unixfrom(line) continue else: raise Errors.HeaderParseError( 'Unix-from in headers after first rfc822 header') # # Header continuation line if line[0] in ' \t': if not lastheader: raise Errors.HeaderParseError( 'Continuation line seen before first header') lastvalue.append(line) continue # Normal, non-continuation header. BAW: this should check to make # sure it's a legal header, e.g. doesn't contain spaces. Also, we # should expose the header matching algorithm in the API, and # allow for a non-strict parsing mode (that ignores the line # instead of raising the exception). i = line.find(':') if i < 0: raise Errors.HeaderParseError( 'Not a header, not a continuation') if lastheader: container[lastheader] = NL.join(lastvalue) lastheader = line[:i] lastvalue = [line[i+1:].lstrip()] # Make sure we retain the last header if lastheader: container[lastheader] = NL.join(lastvalue) def _parsebody(self, container, fp): # Parse the body, but first split the payload on the content-type # boundary if present. boundary = isdigest = None ctype = container['content-type'] if ctype: mo = bcre.search(ctype) if mo: boundary = mo.group(1) isdigest = container.get_type() == 'multipart/digest' # If there's a boundary, split the payload text into its constituent # parts and parse each separately. Otherwise, just parse the rest of # the body as a single message. Note: any exceptions raised in the # recursive parse need to have their line numbers coerced. if boundary: preamble = epilogue = None # Split into subparts. The first boundary we're looking for won't # have the leading newline since we're at the start of the body # text. separator = '--' + boundary payload = fp.read() start = payload.find(separator) if start < 0: raise Errors.BoundaryError( "Couldn't find starting boundary: %s" % boundary) if start > 0: # there's some pre-MIME boundary preamble preamble = payload[0:start] start += len(separator) + 1 + isdigest terminator = payload.find('\n' + separator + '--', start) if terminator < 0: raise Errors.BoundaryError( "Couldn't find terminating boundary: %s" % boundary) if terminator+len(separator)+3 < len(payload): # there's some post-MIME boundary epilogue epilogue = payload[terminator+len(separator)+3:] # We split the textual payload on the boundary separator, which # includes the trailing newline. If the container is a # multipart/digest then the subparts are by default message/rfc822 # instead of text/plain. In that case, they'll have an extra # newline before the headers to distinguish the message's headers # from the subpart headers. if isdigest: separator += '\n\n' else: separator += '\n' parts = payload[start:terminator].split('\n' + separator) for part in parts: msgobj = self.parsestr(part) container.preamble = preamble container.epilogue = epilogue container.add_payload(msgobj) elif ctype == 'message/rfc822': # Create a container for the payload, but watch out for there not # being any headers left try: msg = self.parse(fp) except Errors.HeaderParseError: msg = self._class() self._parsebody(msg, fp) container.add_payload(msg) else: container.add_payload(fp.read())