summaryrefslogtreecommitdiffstats
path: root/Lib/email/Parser.py
diff options
context:
space:
mode:
author: Barry Warsaw <barry@python.org>, 2002-07-18 23:09:09 (GMT)
committer: Barry Warsaw <barry@python.org>, 2002-07-18 23:09:09 (GMT)
commit: 7aeac9180e3d6df3d5db89ee7ff5941a81dc5a5d (patch)
tree: 0b729e7ad55be4cb686ce85bbdf9e0f8dfc8184d /Lib/email/Parser.py
parent: e21262ca9e286aee27741eb8bb69508a911ec10b (diff)
download: cpython-7aeac9180e3d6df3d5db89ee7ff5941a81dc5a5d.zip
download: cpython-7aeac9180e3d6df3d5db89ee7ff5941a81dc5a5d.tar.gz
download: cpython-7aeac9180e3d6df3d5db89ee7ff5941a81dc5a5d.tar.bz2
Anthony Baxter's cleanup patch. Python project SF patch # 583190,
quoting:

  - In non-strict mode, messages don't require a blank line at the end
    with a missing end-terminator. A single newline is sufficient now.
  - Handle trailing whitespace at the end of a boundary. Had to switch
    from using string.split() to re.split().
  - Handle whitespace on the end of a parameter list for Content-type.
  - Handle whitespace on the end of a plain content-type header.

Specifically:

  - get_type(): Strip the content type string.
  - _get_params_preserve(): Strip the parameter names and values on both
    sides.
  - _parsebody(): Lots of changes as described above, with some stylistic
    changes by Barry (who hopefully didn't screw things up ;).
Diffstat (limited to 'Lib/email/Parser.py')
-rw-r--r-- Lib/email/Parser.py 43
1 files changed, 25 insertions, 18 deletions
diff --git a/Lib/email/Parser.py b/Lib/email/Parser.py
index 228adbc..3081107 100644
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -124,19 +124,25 @@ class Parser:
if boundary:
preamble = epilogue = None
# Split into subparts. The first boundary we're looking for won't
- # have the leading newline since we're at the start of the body
- # text.
+ # always have a leading newline since we're at the start of the
+ # body text, and there's not always a preamble before the first
+ # boundary.
separator = '--' + boundary
payload = fp.read()
- start = payload.find(separator)
- if start < 0:
+ # We use an RE here because boundaries can have trailing
+ # whitespace.
+ mo = re.search(
+ r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
+ payload)
+ if not mo:
raise Errors.BoundaryError(
"Couldn't find starting boundary: %s" % boundary)
+ start = mo.start()
if start > 0:
# there's some pre-MIME boundary preamble
preamble = payload[0:start]
# Find out what kind of line endings we're using
- start += len(separator)
+ start += len(mo.group('sep')) + len(mo.group('ws'))
cre = re.compile('\r\n|\r|\n')
mo = cre.search(payload, start)
if mo:
@@ -151,31 +157,32 @@ class Parser:
terminator = mo.start()
linesep = mo.group('sep')
if mo.end() < len(payload):
- # there's some post-MIME boundary epilogue
+ # There's some post-MIME boundary epilogue
epilogue = payload[mo.end():]
elif self._strict:
raise Errors.BoundaryError(
"Couldn't find terminating boundary: %s" % boundary)
else:
- # handle the case of no trailing boundary. I hate mail clients.
- # check that it ends in a blank line
- endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
- mo = endre.search(payload)
+ # Handle the case of no trailing boundary. Check that it ends
+ # in a blank line. Some cases (spamspamspam) don't even have
+ # that!
+ mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
if not mo:
- raise Errors.BoundaryError(
- "Couldn't find terminating boundary, and no "+
- "trailing empty line")
- else:
- linesep = mo.group('sep')
- terminator = len(payload)
+ mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
+ if not mo:
+ raise Errors.BoundaryError(
+ 'No terminating boundary and no trailing empty line')
+ linesep = mo.group('sep')
+ terminator = len(payload)
# We split the textual payload on the boundary separator, which
# includes the trailing newline. If the container is a
# multipart/digest then the subparts are by default message/rfc822
# instead of text/plain. In that case, they'll have a optional
# block of MIME headers, then an empty line followed by the
# message headers.
- separator += linesep
- parts = payload[start:terminator].split(linesep + separator)
+ parts = re.split(
+ linesep + re.escape(separator) + r'[ \t]*' + linesep,
+ payload[start:terminator])
for part in parts:
if isdigest:
if part[0] == linesep: