diff options
author | Thomas Wouters <thomas@python.org> | 2004-03-20 17:31:29 (GMT) |
---|---|---|
committer | Thomas Wouters <thomas@python.org> | 2004-03-20 17:31:29 (GMT) |
commit | 0813d76cb09f4672bdf893fb4a6eef49a00593db (patch) | |
tree | 49a852db6b167fa6eaee9ae593ed8a6b49527955 | |
parent | d4079e1fc2382d717d8da914462bfc6026690118 (diff) | |
download | cpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.zip cpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.tar.gz cpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.tar.bz2 |
Merge in Anthony's new parser code, from the anthony-parser-branch:
> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17; author: anthonybaxter; state: Exp; lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40; author: anthonybaxter; state: Exp; lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09; author: anthonybaxter; state: Exp; lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56; author: anthonybaxter; state: Exp; lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
-rw-r--r-- | Lib/email/Parser.py | 301 |
1 files changed, 173 insertions, 128 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py index 09fac45..3fe1990 100644 --- a/Lib/email/Parser.py +++ b/Lib/email/Parser.py @@ -22,6 +22,75 @@ except NameError: NLCRE = re.compile('\r\n|\r|\n') +class TextUtil: + """ A utility class for wrapping a file object and providing a + couple of additional useful functions. + """ + + def __init__(self, fp): + self.fp = fp + self.unread = [] + + def readline(self): + """ Return a line of data. + + If data has been pushed back with unreadline(), the most recently + returned unreadline()d data will be returned. + """ + if self.unread: + return self.unread.pop() + else: + return self.fp.readline() + + def unreadline(self, line): + """Push a line back into the object. + """ + self.unread.append(line) + + def peekline(self): + """Non-destructively look at the next line""" + line = self.readline() + self.unreadline(line) + return line + + def read(self): + """Return the remaining data + """ + r = self.fp.read() + if self.unread: + r = "\n".join(self.unread) + r + self.unread = [] + return r + + def readuntil(self, re, afterblank=0, includematch=0): + """Read a line at a time until we get the specified RE. + + Returns the text up to (and including, if includematch is true) the + matched text, and the RE match object. If afterblank is true, + there must be a blank line before the matched text. Moves current + filepointer to the line following the matched line. If we reach + end-of-file, return what we've got so far, and return None as the + RE match object. + """ + prematch = [] + blankseen = 0 + while 1: + line = self.readline() + if not line: + # end of file + return EMPTYSTRING.join(prematch), None + if afterblank: + if NLCRE.match(line): + blankseen = 1 + continue + else: + blankseen = 0 + m = re.match(line) + if (m and not afterblank) or (m and afterblank and blankseen): + if includematch: + prematch.append(line) + return EMPTYSTRING.join(prematch), m + prematch.append(line) class Parser: @@ -59,9 +128,13 @@ class Parser: meaning it parses the entire contents of the file. """ root = self._class() - firstbodyline = self._parseheaders(root, fp) + fp = TextUtil(fp) + self._parseheaders(root, fp) if not headersonly: - self._parsebody(root, fp, firstbodyline) + obj = self._parsemessage(root, fp) + trailer = fp.read() + if obj and trailer: + self._attach_trailer(obj, trailer) return root def parsestr(self, text, headersonly=False): @@ -80,7 +153,6 @@ class Parser: lastheader = '' lastvalue = [] lineno = 0 - firstbodyline = None while True: # Don't strip the line before we test for the end condition, # because whitespace-only header lines are RFC compliant @@ -129,7 +201,7 @@ class Parser: # There was no separating blank line as mandated by RFC # 2822, but we're in non-strict mode. So just offer up # this current line as the first body line. - firstbodyline = line + fp.unreadline(line) break if lastheader: container[lastheader] = NL.join(lastvalue) @@ -138,140 +210,114 @@ class Parser: # Make sure we retain the last header if lastheader: container[lastheader] = NL.join(lastvalue) - return firstbodyline + return - def _parsebody(self, container, fp, firstbodyline=None): - # Parse the body, but first split the payload on the content-type - # boundary if present. + def _parsemessage(self, container, fp): + # Parse the body. We walk through the body from top to bottom, + # keeping track of the current multipart nesting as we go. + # We return the object that gets the data at the end of this + # block. boundary = container.get_boundary() isdigest = (container.get_content_type() == 'multipart/digest') - # If there's a boundary, split the payload text into its constituent - # parts and parse each separately. Otherwise, just parse the rest of - # the body as a single message. Note: any exceptions raised in the - # recursive parse need to have their line numbers coerced. - if boundary: - preamble = epilogue = None - # Split into subparts. The first boundary we're looking for won't - # always have a leading newline since we're at the start of the - # body text, and there's not always a preamble before the first - # boundary. + if boundary: separator = '--' + boundary - payload = fp.read() - if firstbodyline is not None: - payload = firstbodyline + '\n' + payload - # We use an RE here because boundaries can have trailing - # whitespace. - mo = re.search( - r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)', - payload) - if not mo: - if self._strict: - raise Errors.BoundaryError( - "Couldn't find starting boundary: %s" % boundary) - container.set_payload(payload) - return - start = mo.start() - if start > 0: - # there's some pre-MIME boundary preamble - preamble = payload[0:start] - # Find out what kind of line endings we're using - start += len(mo.group('sep')) + len(mo.group('ws')) - mo = NLCRE.search(payload, start) - if mo: - start += len(mo.group(0)) - # We create a compiled regexp first because we need to be able to - # specify the start position, and the module function doesn't - # support this signature. :( - cre = re.compile('(?P<sep>\r\n|\r|\n)' + - re.escape(separator) + '--') - mo = cre.search(payload, start) - if mo: - terminator = mo.start() - linesep = mo.group('sep') - if mo.end() < len(payload): - # There's some post-MIME boundary epilogue - epilogue = payload[mo.end():] - elif self._strict: - raise Errors.BoundaryError( - "Couldn't find terminating boundary: %s" % boundary) + boundaryRE = re.compile( + r'(?P<sep>' + re.escape(separator) + + r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$') + preamble, matchobj = fp.readuntil(boundaryRE) + if not matchobj: + # Broken - we hit the end of file. Just set the body + # to the text. + container.set_payload(preamble) + return container + if preamble: + container.preamble = preamble else: - # Handle the case of no trailing boundary. Check that it ends - # in a blank line. Some cases (spamspamspam) don't even have - # that! - mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload) - if not mo: - mo = re.search('(?P<sep>\r\n|\r|\n)$', payload) - if not mo: - raise Errors.BoundaryError( - 'No terminating boundary and no trailing empty line') - linesep = mo.group('sep') - terminator = len(payload) - # We split the textual payload on the boundary separator, which - # includes the trailing newline. If the container is a - # multipart/digest then the subparts are by default message/rfc822 - # instead of text/plain. In that case, they'll have a optional - # block of MIME headers, then an empty line followed by the - # message headers. - parts = re.split( - linesep + re.escape(separator) + r'[ \t]*' + linesep, - payload[start:terminator]) - for part in parts: + # The module docs specify an empty preamble is None, not '' + container.preamble = None + while 1: + subobj = self._class() if isdigest: - if part.startswith(linesep): - # There's no header block so create an empty message - # object as the container, and lop off the newline so - # we can parse the sub-subobject - msgobj = self._class() - part = part[len(linesep):] + subobj.set_default_type('message/rfc822') + firstline = fp.peekline() + if firstline.strip(): + # we have MIME headers. all good. + self._parseheaders(subobj, fp) else: - parthdrs, part = part.split(linesep+linesep, 1) - # msgobj in this case is the "message/rfc822" container - msgobj = self.parsestr(parthdrs, headersonly=1) - # while submsgobj is the message itself - msgobj.set_default_type('message/rfc822') - maintype = msgobj.get_content_maintype() - if maintype in ('message', 'multipart'): - submsgobj = self.parsestr(part) - msgobj.attach(submsgobj) - else: - msgobj.set_payload(part) + # no MIME headers. this is allowed for multipart/digest + # Consume the extra blank line + fp.readline() + pass else: - msgobj = self.parsestr(part) - container.preamble = preamble - container.epilogue = epilogue - container.attach(msgobj) - elif container.get_main_type() == 'multipart': + self._parseheaders(subobj, fp) + container.attach(subobj) + maintype = subobj.get_content_maintype() + hassubparts = (subobj.get_content_maintype() in + ( "message", "multipart" )) + if hassubparts: + subobj = self._parsemessage(subobj, fp) + + trailer, matchobj = fp.readuntil(boundaryRE) + if matchobj is None or trailer: + mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer) + if not mo: + mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer) + if not mo: + raise Errors.BoundaryError( + 'No terminating boundary and no trailing empty line') + linesep = mo.group('sep') + trailer = trailer[:-len(linesep)] + if trailer: + self._attach_trailer(subobj, trailer) + if matchobj is None or matchobj.group('end'): + # That was the last piece of data. Let our caller attach + # the epilogue to us. But before we do that, push the + # line ending of the match group back into the readline + # buffer, as it's part of the epilogue. + if matchobj: + fp.unreadline(matchobj.group('linesep')) + return container + + elif container.get_content_maintype() == "multipart": # Very bad. A message is a multipart with no boundary! raise Errors.BoundaryError( - 'multipart message with no defined boundary') - elif container.get_type() == 'message/delivery-status': - # This special kind of type contains blocks of headers separated - # by a blank line. We'll represent each header block as a - # separate Message object - blocks = [] - while True: - blockmsg = self._class() - self._parseheaders(blockmsg, fp) - if not len(blockmsg): - # No more header blocks left - break - blocks.append(blockmsg) - container.set_payload(blocks) - elif container.get_main_type() == 'message': - # Create a container for the payload, but watch out for there not - # being any headers left - try: - msg = self.parse(fp) - except Errors.HeaderParseError: + 'multipart message with no defined boundary') + elif container.get_content_maintype() == "message": + ct = container.get_content_type() + if ct == "message/rfc822": + submessage = self._class() + self._parseheaders(submessage, fp) + self._parsemessage(submessage, fp) + container.attach(submessage) + return submessage + elif ct == "message/delivery-status": + # This special kind of type contains blocks of headers + # separated by a blank line. We'll represent each header + # block as a separate Message object + while 1: + nextblock = self._class() + self._parseheaders(nextblock, fp) + container.attach(nextblock) + # next peek ahead to see whether we've hit the end or not + nextline = fp.peekline() + if nextline[:2] == "--": + break + return container + else: + # Other sort of message object (e.g. external-body) msg = self._class() - self._parsebody(msg, fp) - container.attach(msg) + self._parsemessage(msg, fp) + container.attach(msg) + return msg else: - text = fp.read() - if firstbodyline is not None: - text = firstbodyline + '\n' + text - container.set_payload(text) + # single body section. We let our caller set the payload. + return container + def _attach_trailer(self, obj, trailer): + if obj.get_content_maintype() in ("message", "multipart"): + obj.epilogue = trailer + else: + obj.set_payload(trailer) class HeaderParser(Parser): @@ -284,9 +330,8 @@ class HeaderParser(Parser): Parsing with this subclass can be considerably faster if all you're interested in is the message headers. """ - def _parsebody(self, container, fp, firstbodyline=None): + def _parsemessage(self, container, fp): # Consume but do not parse, the body text = fp.read() - if firstbodyline is not None: - text = firstbodyline + '\n' + text container.set_payload(text) + return None |