diff options
Diffstat (limited to 'Lib/email/Parser.py')
-rw-r--r-- | Lib/email/Parser.py | 301 |
1 files changed, 20 insertions, 281 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py index 3fe1990..8c5661d 100644 --- a/Lib/email/Parser.py +++ b/Lib/email/Parser.py @@ -1,100 +1,20 @@ -# Copyright (C) 2001,2002 Python Software Foundation -# Author: barry@zope.com (Barry Warsaw) +# Copyright (C) 2001-2004 Python Software Foundation +# Author: Barry Warsaw, Thomas Wouters, Anthony Baxter +# Contact: email-sig@python.org -"""A parser of RFC 2822 and MIME email messages. -""" +"""A parser of RFC 2822 and MIME email messages.""" import re from cStringIO import StringIO -from types import ListType - -from email import Errors -from email import Message - -EMPTYSTRING = '' -NL = '\n' - -try: - True, False -except NameError: - True = 1 - False = 0 +from email.FeedParser import FeedParser +from email.Message import Message NLCRE = re.compile('\r\n|\r|\n') -class TextUtil: - """ A utility class for wrapping a file object and providing a - couple of additional useful functions. - """ - - def __init__(self, fp): - self.fp = fp - self.unread = [] - - def readline(self): - """ Return a line of data. - - If data has been pushed back with unreadline(), the most recently - returned unreadline()d data will be returned. - """ - if self.unread: - return self.unread.pop() - else: - return self.fp.readline() - - def unreadline(self, line): - """Push a line back into the object. - """ - self.unread.append(line) - - def peekline(self): - """Non-destructively look at the next line""" - line = self.readline() - self.unreadline(line) - return line - - def read(self): - """Return the remaining data - """ - r = self.fp.read() - if self.unread: - r = "\n".join(self.unread) + r - self.unread = [] - return r - - def readuntil(self, re, afterblank=0, includematch=0): - """Read a line at a time until we get the specified RE. - - Returns the text up to (and including, if includematch is true) the - matched text, and the RE match object. If afterblank is true, - there must be a blank line before the matched text. Moves current - filepointer to the line following the matched line. If we reach - end-of-file, return what we've got so far, and return None as the - RE match object. - """ - prematch = [] - blankseen = 0 - while 1: - line = self.readline() - if not line: - # end of file - return EMPTYSTRING.join(prematch), None - if afterblank: - if NLCRE.match(line): - blankseen = 1 - continue - else: - blankseen = 0 - m = re.match(line) - if (m and not afterblank) or (m and afterblank and blankseen): - if includematch: - prematch.append(line) - return EMPTYSTRING.join(prematch), m - prematch.append(line) class Parser: - def __init__(self, _class=Message.Message, strict=False): + def __init__(self, _class=Message, strict=False): """Parser of RFC 2822 and MIME email messages. Creates an in-memory object tree representing the email message, which @@ -117,7 +37,6 @@ class Parser: Default is non-strict parsing. """ self._class = _class - self._strict = strict def parse(self, fp, headersonly=False): """Create a message structure from the data in a file. @@ -127,15 +46,15 @@ class Parser: parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ - root = self._class() - fp = TextUtil(fp) - self._parseheaders(root, fp) - if not headersonly: - obj = self._parsemessage(root, fp) - trailer = fp.read() - if obj and trailer: - self._attach_trailer(obj, trailer) - return root + feedparser = FeedParser(self._class) + if headersonly: + feedparser._set_headersonly() + while True: + data = fp.read(8192) + if not data: + break + feedparser.feed(data) + return feedparser.close() def parsestr(self, text, headersonly=False): """Create a message structure from a string. @@ -147,191 +66,11 @@ class Parser: """ return self.parse(StringIO(text), headersonly=headersonly) - def _parseheaders(self, container, fp): - # Parse the headers, returning a list of header/value pairs. None as - # the header means the Unix-From header. - lastheader = '' - lastvalue = [] - lineno = 0 - while True: - # Don't strip the line before we test for the end condition, - # because whitespace-only header lines are RFC compliant - # continuation lines. - line = fp.readline() - if not line: - break - line = line.splitlines()[0] - if not line: - break - # Ignore the trailing newline - lineno += 1 - # Check for initial Unix From_ line - if line.startswith('From '): - if lineno == 1: - container.set_unixfrom(line) - continue - elif self._strict: - raise Errors.HeaderParseError( - 'Unix-from in headers after first rfc822 header') - else: - # ignore the wierdly placed From_ line - # XXX: maybe set unixfrom anyway? or only if not already? - continue - # Header continuation line - if line[0] in ' \t': - if not lastheader: - raise Errors.HeaderParseError( - 'Continuation line seen before first header') - lastvalue.append(line) - continue - # Normal, non-continuation header. BAW: this should check to make - # sure it's a legal header, e.g. doesn't contain spaces. Also, we - # should expose the header matching algorithm in the API, and - # allow for a non-strict parsing mode (that ignores the line - # instead of raising the exception). - i = line.find(':') - if i < 0: - if self._strict: - raise Errors.HeaderParseError( - "Not a header, not a continuation: ``%s''" % line) - elif lineno == 1 and line.startswith('--'): - # allow through duplicate boundary tags. - continue - else: - # There was no separating blank line as mandated by RFC - # 2822, but we're in non-strict mode. So just offer up - # this current line as the first body line. - fp.unreadline(line) - break - if lastheader: - container[lastheader] = NL.join(lastvalue) - lastheader = line[:i] - lastvalue = [line[i+1:].lstrip()] - # Make sure we retain the last header - if lastheader: - container[lastheader] = NL.join(lastvalue) - return - - def _parsemessage(self, container, fp): - # Parse the body. We walk through the body from top to bottom, - # keeping track of the current multipart nesting as we go. - # We return the object that gets the data at the end of this - # block. - boundary = container.get_boundary() - isdigest = (container.get_content_type() == 'multipart/digest') - if boundary: - separator = '--' + boundary - boundaryRE = re.compile( - r'(?P<sep>' + re.escape(separator) + - r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$') - preamble, matchobj = fp.readuntil(boundaryRE) - if not matchobj: - # Broken - we hit the end of file. Just set the body - # to the text. - container.set_payload(preamble) - return container - if preamble: - container.preamble = preamble - else: - # The module docs specify an empty preamble is None, not '' - container.preamble = None - while 1: - subobj = self._class() - if isdigest: - subobj.set_default_type('message/rfc822') - firstline = fp.peekline() - if firstline.strip(): - # we have MIME headers. all good. - self._parseheaders(subobj, fp) - else: - # no MIME headers. this is allowed for multipart/digest - # Consume the extra blank line - fp.readline() - pass - else: - self._parseheaders(subobj, fp) - container.attach(subobj) - maintype = subobj.get_content_maintype() - hassubparts = (subobj.get_content_maintype() in - ( "message", "multipart" )) - if hassubparts: - subobj = self._parsemessage(subobj, fp) - - trailer, matchobj = fp.readuntil(boundaryRE) - if matchobj is None or trailer: - mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer) - if not mo: - mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer) - if not mo: - raise Errors.BoundaryError( - 'No terminating boundary and no trailing empty line') - linesep = mo.group('sep') - trailer = trailer[:-len(linesep)] - if trailer: - self._attach_trailer(subobj, trailer) - if matchobj is None or matchobj.group('end'): - # That was the last piece of data. Let our caller attach - # the epilogue to us. But before we do that, push the - # line ending of the match group back into the readline - # buffer, as it's part of the epilogue. - if matchobj: - fp.unreadline(matchobj.group('linesep')) - return container - - elif container.get_content_maintype() == "multipart": - # Very bad. A message is a multipart with no boundary! - raise Errors.BoundaryError( - 'multipart message with no defined boundary') - elif container.get_content_maintype() == "message": - ct = container.get_content_type() - if ct == "message/rfc822": - submessage = self._class() - self._parseheaders(submessage, fp) - self._parsemessage(submessage, fp) - container.attach(submessage) - return submessage - elif ct == "message/delivery-status": - # This special kind of type contains blocks of headers - # separated by a blank line. We'll represent each header - # block as a separate Message object - while 1: - nextblock = self._class() - self._parseheaders(nextblock, fp) - container.attach(nextblock) - # next peek ahead to see whether we've hit the end or not - nextline = fp.peekline() - if nextline[:2] == "--": - break - return container - else: - # Other sort of message object (e.g. external-body) - msg = self._class() - self._parsemessage(msg, fp) - container.attach(msg) - return msg - else: - # single body section. We let our caller set the payload. - return container - - def _attach_trailer(self, obj, trailer): - if obj.get_content_maintype() in ("message", "multipart"): - obj.epilogue = trailer - else: - obj.set_payload(trailer) class HeaderParser(Parser): - """A subclass of Parser, this one only meaningfully parses message headers. - - This class can be used if all you're interested in is the headers of a - message. While it consumes the message body, it does not parse it, but - simply makes it available as a string payload. + def parse(self, fp, headersonly=True): + return Parser.parse(self, fp, True) - Parsing with this subclass can be considerably faster if all you're - interested in is the message headers. - """ - def _parsemessage(self, container, fp): - # Consume but do not parse, the body - text = fp.read() - container.set_payload(text) - return None + def parsestr(self, text, headersonly=True): + return Parser.parsestr(self, text, True) |