summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBarry Warsaw <barry@python.org>2002-11-05 21:44:06 (GMT)
committerBarry Warsaw <barry@python.org>2002-11-05 21:44:06 (GMT)
commitda2525ed2a9c8ee6219b469fbbae47b9cec2c4ba (patch)
treea2ba689a5ee2ea86cac8a113adc58e986243d76e
parenta0a00761a500478223f0e076983fddcdfb4ac587 (diff)
downloadcpython-da2525ed2a9c8ee6219b469fbbae47b9cec2c4ba.zip
cpython-da2525ed2a9c8ee6219b469fbbae47b9cec2c4ba.tar.gz
cpython-da2525ed2a9c8ee6219b469fbbae47b9cec2c4ba.tar.bz2
parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,
where in lax parsing, the first non-header line after a header block (e.g. the first line not containing a colon, and not a continuation), can be treated as the first body line, even without the RFC mandated blank line separator. rfc822 had this behavior, and I vaguely remember problems with this, but can't remember details. In any event, all the tests still pass, so I guess we'll find out. ;/ This patch works by returning the non-header, non-continuation line from _parseheader() and using that as the first header line prepended to fp.read() if given. It's usually None. We use this approach instead of trying to seek/tell the file-like object.
-rw-r--r--Lib/email/Parser.py31
1 files changed, 22 insertions, 9 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
index 6dfa4d3..8e0ac44 100644
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -59,9 +59,9 @@ class Parser:
meaning it parses the entire contents of the file.
"""
root = self._class()
- self._parseheaders(root, fp)
+ firstbodyline = self._parseheaders(root, fp)
if not headersonly:
- self._parsebody(root, fp)
+ self._parsebody(root, fp, firstbodyline)
return root
def parsestr(self, text, headersonly=False):
@@ -80,6 +80,7 @@ class Parser:
lastheader = ''
lastvalue = []
lineno = 0
+ firstbodyline = None
while True:
# Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant
@@ -120,13 +121,16 @@ class Parser:
if i < 0:
if self._strict:
raise Errors.HeaderParseError(
- "Not a header, not a continuation: ``%s''"%line)
+ "Not a header, not a continuation: ``%s''" % line)
elif lineno == 1 and line.startswith('--'):
# allow through duplicate boundary tags.
continue
else:
- raise Errors.HeaderParseError(
- "Not a header, not a continuation: ``%s''"%line)
+ # There was no separating blank line as mandated by RFC
+ # 2822, but we're in non-strict mode. So just offer up
+ # this current line as the first body line.
+ firstbodyline = line
+ break
if lastheader:
container[lastheader] = NL.join(lastvalue)
lastheader = line[:i]
@@ -134,8 +138,9 @@ class Parser:
# Make sure we retain the last header
if lastheader:
container[lastheader] = NL.join(lastvalue)
+ return firstbodyline
- def _parsebody(self, container, fp):
+ def _parsebody(self, container, fp, firstbodyline=None):
# Parse the body, but first split the payload on the content-type
# boundary if present.
boundary = container.get_boundary()
@@ -152,6 +157,8 @@ class Parser:
# boundary.
separator = '--' + boundary
payload = fp.read()
+ if firstbodyline is not None:
+ payload = firstbodyline + '\n' + payload
# We use an RE here because boundaries can have trailing
# whitespace.
mo = re.search(
@@ -260,7 +267,10 @@ class Parser:
self._parsebody(msg, fp)
container.attach(msg)
else:
- container.set_payload(fp.read())
+ text = fp.read()
+ if firstbodyline is not None:
+ text = firstbodyline + '\n' + text
+ container.set_payload(text)
@@ -274,6 +284,9 @@ class HeaderParser(Parser):
Parsing with this subclass can be considerably faster if all you're
interested in is the message headers.
"""
- def _parsebody(self, container, fp):
+ def _parsebody(self, container, fp, firstbodyline=None):
# Consume but do not parse, the body
- container.set_payload(fp.read())
+ text = fp.read()
+ if firstbodyline is not None:
+ text = firstbodyline + '\n' + text
+ container.set_payload(text)