summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Wouters <thomas@python.org>2004-03-20 17:31:29 (GMT)
committerThomas Wouters <thomas@python.org>2004-03-20 17:31:29 (GMT)
commit0813d76cb09f4672bdf893fb4a6eef49a00593db (patch)
tree49a852db6b167fa6eaee9ae593ed8a6b49527955
parentd4079e1fc2382d717d8da914462bfc6026690118 (diff)
downloadcpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.zip
cpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.tar.gz
cpython-0813d76cb09f4672bdf893fb4a6eef49a00593db.tar.bz2
Merge in Anthony's new parser code, from the anthony-parser-branch:
> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17; author: anthonybaxter; state: Exp; lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40; author: anthonybaxter; state: Exp; lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09; author: anthonybaxter; state: Exp; lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56; author: anthonybaxter; state: Exp; lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
-rw-r--r--Lib/email/Parser.py301
1 files changed, 173 insertions, 128 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
index 09fac45..3fe1990 100644
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -22,6 +22,75 @@ except NameError:
NLCRE = re.compile('\r\n|\r|\n')
class TextUtil:
    """Wrap a file object and add line push-back and pattern scanning.

    Lines handed to unreadline() are kept on a stack (self.unread) and are
    returned, most recently pushed first, before any further data is taken
    from the wrapped file object.
    """

    def __init__(self, fp):
        # The wrapped object; only its readline() and read() are used here.
        self.fp = fp
        # Push-back stack: the LAST element is the next line to hand out.
        self.unread = []

    def readline(self):
        """Return the next line of data.

        If data has been pushed back with unreadline(), the most recently
        pushed line is returned; otherwise the result of the wrapped file
        object's readline() is returned.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data and empty the push-back stack.

        Pushed-back lines come first, in the order readline() would have
        returned them, followed by whatever is left in the wrapped file.
        """
        rest = self.fp.read()
        if self.unread:
            # self.unread is a stack, so it has to be reversed to get
            # reading order.  The pieces are joined with nothing in between:
            # every caller pushes back text that already carries its own
            # line ending, so inserting '\n' would add spurious newlines.
            pending = self.unread[::-1]
            self.unread = []
            rest = ''.join(pending) + rest
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until a line matches the given RE.

        Arguments:
        re           -- compiled pattern object; its match() method is tried
                        against each line.  (The name shadows the `re'
                        module inside this method only.)
        afterblank   -- if true, a line only counts as a match when the line
                        immediately before it was blank.  Blank lines are
                        never themselves tried against the pattern in this
                        mode.
        includematch -- if true, the matching line is included at the end of
                        the returned text.

        Returns a 2-tuple (text, matchobject).  `text' is everything read up
        to the matching line, blank lines included.  The current position
        moves to the line following the match.  If end-of-file is reached
        first, the text read so far is returned with None as the match
        object.
        """
        prematch = []
        blankseen = 0
        while True:
            line = self.readline()
            if not line:
                # End of file before any match.
                return EMPTYSTRING.join(prematch), None
            if afterblank and NLCRE.match(line):
                # Remember the blank line for the test on the next line, and
                # keep it in the text instead of silently discarding it.
                blankseen = 1
                prematch.append(line)
                continue
            m = re.match(line)
            if m and (not afterblank or blankseen):
                if includematch:
                    prematch.append(line)
                return EMPTYSTRING.join(prematch), m
            # Only reset the flag AFTER it has been consulted; resetting it
            # before the test would make afterblank matching impossible.
            blankseen = 0
            prematch.append(line)
class Parser:
@@ -59,9 +128,13 @@ class Parser:
meaning it parses the entire contents of the file.
"""
root = self._class()
- firstbodyline = self._parseheaders(root, fp)
+ fp = TextUtil(fp)
+ self._parseheaders(root, fp)
if not headersonly:
- self._parsebody(root, fp, firstbodyline)
+ obj = self._parsemessage(root, fp)
+ trailer = fp.read()
+ if obj and trailer:
+ self._attach_trailer(obj, trailer)
return root
def parsestr(self, text, headersonly=False):
@@ -80,7 +153,6 @@ class Parser:
lastheader = ''
lastvalue = []
lineno = 0
- firstbodyline = None
while True:
# Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant
@@ -129,7 +201,7 @@ class Parser:
# There was no separating blank line as mandated by RFC
# 2822, but we're in non-strict mode. So just offer up
# this current line as the first body line.
- firstbodyline = line
+ fp.unreadline(line)
break
if lastheader:
container[lastheader] = NL.join(lastvalue)
@@ -138,140 +210,114 @@ class Parser:
# Make sure we retain the last header
if lastheader:
container[lastheader] = NL.join(lastvalue)
- return firstbodyline
+ return
- def _parsebody(self, container, fp, firstbodyline=None):
- # Parse the body, but first split the payload on the content-type
- # boundary if present.
+ def _parsemessage(self, container, fp):
+ # Parse the body. We walk through the body from top to bottom,
+ # keeping track of the current multipart nesting as we go.
+ # We return the object that gets the data at the end of this
+ # block.
boundary = container.get_boundary()
isdigest = (container.get_content_type() == 'multipart/digest')
- # If there's a boundary, split the payload text into its constituent
- # parts and parse each separately. Otherwise, just parse the rest of
- # the body as a single message. Note: any exceptions raised in the
- # recursive parse need to have their line numbers coerced.
- if boundary:
- preamble = epilogue = None
- # Split into subparts. The first boundary we're looking for won't
- # always have a leading newline since we're at the start of the
- # body text, and there's not always a preamble before the first
- # boundary.
+ if boundary:
separator = '--' + boundary
- payload = fp.read()
- if firstbodyline is not None:
- payload = firstbodyline + '\n' + payload
- # We use an RE here because boundaries can have trailing
- # whitespace.
- mo = re.search(
- r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
- payload)
- if not mo:
- if self._strict:
- raise Errors.BoundaryError(
- "Couldn't find starting boundary: %s" % boundary)
- container.set_payload(payload)
- return
- start = mo.start()
- if start > 0:
- # there's some pre-MIME boundary preamble
- preamble = payload[0:start]
- # Find out what kind of line endings we're using
- start += len(mo.group('sep')) + len(mo.group('ws'))
- mo = NLCRE.search(payload, start)
- if mo:
- start += len(mo.group(0))
- # We create a compiled regexp first because we need to be able to
- # specify the start position, and the module function doesn't
- # support this signature. :(
- cre = re.compile('(?P<sep>\r\n|\r|\n)' +
- re.escape(separator) + '--')
- mo = cre.search(payload, start)
- if mo:
- terminator = mo.start()
- linesep = mo.group('sep')
- if mo.end() < len(payload):
- # There's some post-MIME boundary epilogue
- epilogue = payload[mo.end():]
- elif self._strict:
- raise Errors.BoundaryError(
- "Couldn't find terminating boundary: %s" % boundary)
+ boundaryRE = re.compile(
+ r'(?P<sep>' + re.escape(separator) +
+ r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
+ preamble, matchobj = fp.readuntil(boundaryRE)
+ if not matchobj:
+ # Broken - we hit the end of file. Just set the body
+ # to the text.
+ container.set_payload(preamble)
+ return container
+ if preamble:
+ container.preamble = preamble
else:
- # Handle the case of no trailing boundary. Check that it ends
- # in a blank line. Some cases (spamspamspam) don't even have
- # that!
- mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
- if not mo:
- mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
- if not mo:
- raise Errors.BoundaryError(
- 'No terminating boundary and no trailing empty line')
- linesep = mo.group('sep')
- terminator = len(payload)
- # We split the textual payload on the boundary separator, which
- # includes the trailing newline. If the container is a
- # multipart/digest then the subparts are by default message/rfc822
- # instead of text/plain. In that case, they'll have a optional
- # block of MIME headers, then an empty line followed by the
- # message headers.
- parts = re.split(
- linesep + re.escape(separator) + r'[ \t]*' + linesep,
- payload[start:terminator])
- for part in parts:
+ # The module docs specify an empty preamble is None, not ''
+ container.preamble = None
+ while 1:
+ subobj = self._class()
if isdigest:
- if part.startswith(linesep):
- # There's no header block so create an empty message
- # object as the container, and lop off the newline so
- # we can parse the sub-subobject
- msgobj = self._class()
- part = part[len(linesep):]
+ subobj.set_default_type('message/rfc822')
+ firstline = fp.peekline()
+ if firstline.strip():
+ # we have MIME headers. all good.
+ self._parseheaders(subobj, fp)
else:
- parthdrs, part = part.split(linesep+linesep, 1)
- # msgobj in this case is the "message/rfc822" container
- msgobj = self.parsestr(parthdrs, headersonly=1)
- # while submsgobj is the message itself
- msgobj.set_default_type('message/rfc822')
- maintype = msgobj.get_content_maintype()
- if maintype in ('message', 'multipart'):
- submsgobj = self.parsestr(part)
- msgobj.attach(submsgobj)
- else:
- msgobj.set_payload(part)
+ # no MIME headers. this is allowed for multipart/digest
+ # Consume the extra blank line
+ fp.readline()
+ pass
else:
- msgobj = self.parsestr(part)
- container.preamble = preamble
- container.epilogue = epilogue
- container.attach(msgobj)
- elif container.get_main_type() == 'multipart':
+ self._parseheaders(subobj, fp)
+ container.attach(subobj)
+ maintype = subobj.get_content_maintype()
+ hassubparts = (subobj.get_content_maintype() in
+ ( "message", "multipart" ))
+ if hassubparts:
+ subobj = self._parsemessage(subobj, fp)
+
+ trailer, matchobj = fp.readuntil(boundaryRE)
+ if matchobj is None or trailer:
+ mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
+ if not mo:
+ mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
+ if not mo:
+ raise Errors.BoundaryError(
+ 'No terminating boundary and no trailing empty line')
+ linesep = mo.group('sep')
+ trailer = trailer[:-len(linesep)]
+ if trailer:
+ self._attach_trailer(subobj, trailer)
+ if matchobj is None or matchobj.group('end'):
+ # That was the last piece of data. Let our caller attach
+ # the epilogue to us. But before we do that, push the
+ # line ending of the match group back into the readline
+ # buffer, as it's part of the epilogue.
+ if matchobj:
+ fp.unreadline(matchobj.group('linesep'))
+ return container
+
+ elif container.get_content_maintype() == "multipart":
# Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError(
- 'multipart message with no defined boundary')
- elif container.get_type() == 'message/delivery-status':
- # This special kind of type contains blocks of headers separated
- # by a blank line. We'll represent each header block as a
- # separate Message object
- blocks = []
- while True:
- blockmsg = self._class()
- self._parseheaders(blockmsg, fp)
- if not len(blockmsg):
- # No more header blocks left
- break
- blocks.append(blockmsg)
- container.set_payload(blocks)
- elif container.get_main_type() == 'message':
- # Create a container for the payload, but watch out for there not
- # being any headers left
- try:
- msg = self.parse(fp)
- except Errors.HeaderParseError:
+ 'multipart message with no defined boundary')
+ elif container.get_content_maintype() == "message":
+ ct = container.get_content_type()
+ if ct == "message/rfc822":
+ submessage = self._class()
+ self._parseheaders(submessage, fp)
+ self._parsemessage(submessage, fp)
+ container.attach(submessage)
+ return submessage
+ elif ct == "message/delivery-status":
+ # This special kind of type contains blocks of headers
+ # separated by a blank line. We'll represent each header
+ # block as a separate Message object
+ while 1:
+ nextblock = self._class()
+ self._parseheaders(nextblock, fp)
+ container.attach(nextblock)
+ # next peek ahead to see whether we've hit the end or not
+ nextline = fp.peekline()
+ if nextline[:2] == "--":
+ break
+ return container
+ else:
+ # Other sort of message object (e.g. external-body)
msg = self._class()
- self._parsebody(msg, fp)
- container.attach(msg)
+ self._parsemessage(msg, fp)
+ container.attach(msg)
+ return msg
else:
- text = fp.read()
- if firstbodyline is not None:
- text = firstbodyline + '\n' + text
- container.set_payload(text)
+ # single body section. We let our caller set the payload.
+ return container
+ def _attach_trailer(self, obj, trailer):
+ if obj.get_content_maintype() in ("message", "multipart"):
+ obj.epilogue = trailer
+ else:
+ obj.set_payload(trailer)
class HeaderParser(Parser):
@@ -284,9 +330,8 @@ class HeaderParser(Parser):
Parsing with this subclass can be considerably faster if all you're
interested in is the message headers.
"""
- def _parsebody(self, container, fp, firstbodyline=None):
+ def _parsemessage(self, container, fp):
# Consume but do not parse, the body
text = fp.read()
- if firstbodyline is not None:
- text = firstbodyline + '\n' + text
container.set_payload(text)
+ return None