From da2525ed2a9c8ee6219b469fbbae47b9cec2c4ba Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Tue, 5 Nov 2002 21:44:06 +0000 Subject: parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527, where in lax parsing, the first non-header line after a header block (e.g. the first line not containing a colon, and not a continuation), can be treated as the first body line, even without the RFC mandated blank line separator. rfc822 had this behavior, and I vaguely remember problems with this, but can't remember details. In any event, all the tests still pass, so I guess we'll find out. ;/ This patch works by returning the non-header, non-continuation line from _parseheader() and using that as the first header line prepended to fp.read() if given. It's usually None. We use this approach instead of trying to seek/tell the file-like object. --- Lib/email/Parser.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py index 6dfa4d3..8e0ac44 100644 --- a/Lib/email/Parser.py +++ b/Lib/email/Parser.py @@ -59,9 +59,9 @@ class Parser: meaning it parses the entire contents of the file. """ root = self._class() - self._parseheaders(root, fp) + firstbodyline = self._parseheaders(root, fp) if not headersonly: - self._parsebody(root, fp) + self._parsebody(root, fp, firstbodyline) return root def parsestr(self, text, headersonly=False): @@ -80,6 +80,7 @@ class Parser: lastheader = '' lastvalue = [] lineno = 0 + firstbodyline = None while True: # Don't strip the line before we test for the end condition, # because whitespace-only header lines are RFC compliant @@ -120,13 +121,16 @@ class Parser: if i < 0: if self._strict: raise Errors.HeaderParseError( - "Not a header, not a continuation: ``%s''"%line) + "Not a header, not a continuation: ``%s''" % line) elif lineno == 1 and line.startswith('--'): # allow through duplicate boundary tags. continue else: - raise Errors.HeaderParseError( - "Not a header, not a continuation: ``%s''"%line) + # There was no separating blank line as mandated by RFC + # 2822, but we're in non-strict mode. So just offer up + # this current line as the first body line. + firstbodyline = line + break if lastheader: container[lastheader] = NL.join(lastvalue) lastheader = line[:i] @@ -134,8 +138,9 @@ class Parser: # Make sure we retain the last header if lastheader: container[lastheader] = NL.join(lastvalue) + return firstbodyline - def _parsebody(self, container, fp): + def _parsebody(self, container, fp, firstbodyline=None): # Parse the body, but first split the payload on the content-type # boundary if present. boundary = container.get_boundary() @@ -152,6 +157,8 @@ class Parser: # boundary. separator = '--' + boundary payload = fp.read() + if firstbodyline is not None: + payload = firstbodyline + '\n' + payload # We use an RE here because boundaries can have trailing # whitespace. mo = re.search( @@ -260,7 +267,10 @@ class Parser: self._parsebody(msg, fp) container.attach(msg) else: - container.set_payload(fp.read()) + text = fp.read() + if firstbodyline is not None: + text = firstbodyline + '\n' + text + container.set_payload(text) @@ -274,6 +284,9 @@ class HeaderParser(Parser): Parsing with this subclass can be considerably faster if all you're interested in is the message headers. """ - def _parsebody(self, container, fp): + def _parsebody(self, container, fp, firstbodyline=None): # Consume but do not parse, the body - container.set_payload(fp.read()) + text = fp.read() + if firstbodyline is not None: + text = firstbodyline + '\n' + text + container.set_payload(text) -- cgit v0.12