"""Generic MIME parser. Classes: MimeParser - Generic MIME parser. Exceptions: MimeError - Exception raised by MimeParser class. XXX To do: - Content-transfer-encoding issues - Use Content-length header in rawbody()? - Cache parts instead of reparsing each time - The message strings in exceptions could use some work """ from types import * # Python types, not MIME types :-) import string import regex import SubFile import mimetools MimeError = "MimeParser.MimeError" # Exception raised by this class class MimeParser: """Generic MIME parser. This requires a seekable file. """ def __init__(self, fp): """Constructor: store the file pointer and parse the headers.""" self._fp = fp self._start = fp.tell() self._headers = h = mimetools.Message(fp) self._bodystart = fp.tell() self._multipart = h.getmaintype() == 'multipart' def multipart(self): """Return whether this is a multipart message.""" return self._multipart def headers(self): """Return the headers of the MIME message, as a Message object.""" return self._headers def rawbody(self): """Return the raw body of the MIME message, as a file-like object. This is a fairly low-level interface -- for a multipart message, you'd have to parse the body yourself, and it doesn't translate the Content-transfer-encoding. """ # XXX Use Content-length to set end if it exists? return SubFile.SubFile(self._fp, self._bodystart) def body(self): """Return the body of a 1-part MIME message, as a file-like object. This should interpret the Content-transfer-encoding, if any (XXX currently it doesn't). """ if self._multipart: raise MimeError, "body() only works for 1-part messages" return self.rawbody() _re_content_length = regex.compile('content-length:[ \t]*\([0-9]+\)', regex.casefold) def rawparts(self): """Return the raw body parts of a multipart MIME message. This returns a list of SubFile() objects corresponding to the parts. Note that the phantom part before the first separator is returned too, as list item 0. If the final part is not followed by a terminator, it is ignored, and this error is not reported. (XXX: the error should be raised). """ if not self._multipart: raise MimeError, "[raw]parts() only works for multipart messages" h = self._headers separator = h.getparam('boundary') if not separator: raise MimeError, "multipart boundary not specified" separator = "--" + separator terminator = separator + "--" ns = len(separator) list = [] f = self._fp start = f.tell() clength = -1 bodystart = -1 inheaders = 0 while 1: end = f.tell() line = f.readline() if not line: break if line[:2] != "--" or line[:ns] != separator: if inheaders: re = self._re_content_length if re.match(line) > 0: try: clength = string.atoi(re.group(1)) except string.atoi_error: pass if not string.strip(line): inheaders = 0 bodystart = f.tell() if clength > 0: # Skip binary data f.read(clength) continue line = string.strip(line) if line == terminator or line == separator: if clength >= 0: # The Content-length header determines the subfile size end = bodystart + clength else: # The final newline is not part of the content end = end-1 list.append(SubFile.SubFile(f, start, end)) start = f.tell() clength = -1 inheaders = 1 if line == terminator: break return list def parts(self): """Return the parsed body parts of a multipart MIME message. This returns a list of MimeParser() instances corresponding to the parts. The phantom part before the first separator is not included. """ return map(MimeParser, self.rawparts()[1:]) def getsubpartbyposition(self, indices): part = self for i in indices: part = part.parts()[i] return part def getsubpartbyid(self, id): h = self._headers cid = h.getheader('content-id') if cid and cid == id: return self if self._multipart: for part in self.parts(): parser = MimeParser(part) hit = parser.getsubpartbyid(id) if hit: return hit return None def index(self): """Return an index of the MIME file. This parses the entire file and returns index information about it, in the form of a tuple (ctype, headers, body) where 'ctype' is the content type string of the message (e.g. `text/plain' or `multipart/mixed') and 'headers' is a Message instance containing the message headers (which should be treated as read-only). The 'body' item depends on the content type: - If it is an atomic message (anything except for content type multipart/*), it is the file-like object returned by self.body(). - For a content type of multipart/*, it is the list of MimeParser() objects returned by self.parts(). """ if self._multipart: body = self.parts() else: body = self.body() return self._headers.gettype(), self._headers, body def _show(parser, level=0): """Helper for _test().""" ctype, headers, body = parser.index() print ctype, if type(body) == ListType: nparts = len(body) print "(%d part%s):" % (nparts, nparts != 1 and "s" or "") n = 0 for part in body: n = n+1 print "%*d." % (4*level+2, n), _show(part, level+1) else: bodylines = body.readlines() print "(%d header lines, %d body lines)" % ( len(headers.headers), len(bodylines)) for line in headers.headers + ['\n'] + bodylines: if line[-1:] == '\n': line = line[:-1] print " "*level + line def _test(args = None): """Test program invoked when run as a script. When a filename argument is specified, it reads from that file. When no arguments are present, it defaults to 'testkp.txt' if it exists, else it defaults to stdin. """ if not args: import sys args = sys.argv[1:] if args: fn = args[0] else: import os fn = 'testkp.txt' if not os.path.exists(fn): fn = '-' if fn == '-': fp = sys.stdin else: fp = open(fn) mp = MimeParser(fp) _show(mp) if __name__ == '__main__': import sys _test()