summaryrefslogtreecommitdiffstats
path: root/Lib/email/Parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email/Parser.py')
-rw-r--r--Lib/email/Parser.py154
1 files changed, 154 insertions, 0 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
new file mode 100644
index 0000000..cc23d19
--- /dev/null
+++ b/Lib/email/Parser.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2001 Python Software Foundation
+# Author: barry@zope.com (Barry Warsaw)
+
+"""A parser of RFC 2822 and MIME email messages.
+"""
+
+import re
+from cStringIO import StringIO
+
+# Intrapackage imports
+import Errors
+import Message
+
+bcre = re.compile('boundary="?([^"]+)"?', re.IGNORECASE)
+EMPTYSTRING = ''
+NL = '\n'
+
+
+
+class Parser:
+ def __init__(self, _class=Message.Message):
+ """Parser of RFC 2822 and MIME email messages.
+
+ Creates an in-memory object tree representing the email message, which
+ can then be manipulated and turned over to a Generator to return the
+ textual representation of the message.
+
+ The string must be formatted as a block of RFC 2822 headers and header
+ continuation lines, optionally preceeded by a `Unix-from' header. The
+ header block is terminated either by the end of the string or by a
+ blank line.
+
+ _class is the class to instantiate for new message objects when they
+ must be created. This class must have a constructor that can take
+ zero arguments. Default is Message.Message.
+ """
+ self._class = _class
+
+ def parse(self, fp):
+ root = self._class()
+ self._parseheaders(root, fp)
+ self._parsebody(root, fp)
+ return root
+
+ def parsestr(self, text):
+ return self.parse(StringIO(text))
+
+ def _parseheaders(self, container, fp):
+ # Parse the headers, returning a list of header/value pairs. None as
+ # the header means the Unix-From header.
+ lastheader = ''
+ lastvalue = []
+ lineno = 0
+ while 1:
+ line = fp.readline()[:-1]
+ if not line or not line.strip():
+ break
+ lineno += 1
+ # Check for initial Unix From_ line
+ if line.startswith('From '):
+ if lineno == 1:
+ container.set_unixfrom(line)
+ continue
+ else:
+ raise Errors.HeaderParseError(
+ 'Unix-from in headers after first rfc822 header')
+ #
+ # Header continuation line
+ if line[0] in ' \t':
+ if not lastheader:
+ raise Errors.HeaderParseError(
+ 'Continuation line seen before first header')
+ lastvalue.append(line)
+ continue
+ # Normal, non-continuation header. BAW: this should check to make
+ # sure it's a legal header, e.g. doesn't contain spaces. Also, we
+ # should expose the header matching algorithm in the API, and
+ # allow for a non-strict parsing mode (that ignores the line
+ # instead of raising the exception).
+ i = line.find(':')
+ if i < 0:
+ raise Errors.HeaderParseError(
+ 'Not a header, not a continuation')
+ if lastheader:
+ container[lastheader] = NL.join(lastvalue)
+ lastheader = line[:i]
+ lastvalue = [line[i+1:].lstrip()]
+ # Make sure we retain the last header
+ if lastheader:
+ container[lastheader] = NL.join(lastvalue)
+
+ def _parsebody(self, container, fp):
+ # Parse the body, but first split the payload on the content-type
+ # boundary if present.
+ boundary = isdigest = None
+ ctype = container['content-type']
+ if ctype:
+ mo = bcre.search(ctype)
+ if mo:
+ boundary = mo.group(1)
+ isdigest = container.get_type() == 'multipart/digest'
+ # If there's a boundary, split the payload text into its constituent
+ # parts and parse each separately. Otherwise, just parse the rest of
+ # the body as a single message. Note: any exceptions raised in the
+ # recursive parse need to have their line numbers coerced.
+ if boundary:
+ preamble = epilogue = None
+ # Split into subparts. The first boundary we're looking for won't
+ # have the leading newline since we're at the start of the body
+ # text.
+ separator = '--' + boundary
+ payload = fp.read()
+ start = payload.find(separator)
+ if start < 0:
+ raise Errors.BoundaryError(
+ "Couldn't find starting boundary: %s" % boundary)
+ if start > 0:
+ # there's some pre-MIME boundary preamble
+ preamble = payload[0:start]
+ start += len(separator) + 1 + isdigest
+ terminator = payload.find('\n' + separator + '--', start)
+ if terminator < 0:
+ raise Errors.BoundaryError(
+ "Couldn't find terminating boundary: %s" % boundary)
+ if terminator+len(separator)+3 < len(payload):
+ # there's some post-MIME boundary epilogue
+ epilogue = payload[terminator+len(separator)+3:]
+ # We split the textual payload on the boundary separator, which
+ # includes the trailing newline. If the container is a
+ # multipart/digest then the subparts are by default message/rfc822
+ # instead of text/plain. In that case, they'll have an extra
+ # newline before the headers to distinguish the message's headers
+ # from the subpart headers.
+ if isdigest:
+ separator += '\n\n'
+ else:
+ separator += '\n'
+ parts = payload[start:terminator].split('\n' + separator)
+ for part in parts:
+ msgobj = self.parsestr(part)
+ container.preamble = preamble
+ container.epilogue = epilogue
+ container.add_payload(msgobj)
+ elif ctype == 'message/rfc822':
+ # Create a container for the payload, but watch out for there not
+ # being any headers left
+ try:
+ msg = self.parse(fp)
+ except Errors.HeaderParseError:
+ msg = self._class()
+ self._parsebody(msg, fp)
+ container.add_payload(msg)
+ else:
+ container.add_payload(fp.read())