summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/email/Header.py 254
1 files changed, 154 insertions, 100 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index 83c5843..abc342c 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -4,10 +4,12 @@
"""Header encoding and decoding functionality."""
import re
+import binascii
from types import StringType, UnicodeType
import email.quopriMIME
import email.base64MIME
+from email.Errors import HeaderParseError
from email.Charset import Charset
try:
@@ -25,6 +27,7 @@ except NameError:
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NL = '\n'
+SPACE = ' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
@@ -47,6 +50,13 @@ ecre = re.compile(r'''
\?= # literal ?=
''', re.VERBOSE | re.IGNORECASE)
+pcre = re.compile('([,;])')
+
+# Field name regexp, including trailing colon, but not separating whitespace,
+# according to RFC 2822. Character range is from exclamation mark to tilde.
+# For use with .match()
+fcre = re.compile(r'[\041-\176]+:$')
+
# Helpers
@@ -61,6 +71,9 @@ def decode_header(header):
decoded parts of the header. Charset is None for non-encoded parts of the
header, otherwise a lower-case string containing the name of the character
set specified in the encoded string.
+
+ An email.Errors.HeaderParseError may be raised when certain decoding errors
+ occur (e.g. a base64 decoding exception).
"""
# If no encoding, just return the header
header = str(header)
@@ -85,12 +98,18 @@ def decode_header(header):
if parts:
charset, encoding = [s.lower() for s in parts[0:2]]
encoded = parts[2]
- dec = ''
+ dec = None
if encoding == 'q':
dec = email.quopriMIME.header_decode(encoded)
elif encoding == 'b':
- dec = email.base64MIME.decode(encoded)
- else:
+ try:
+ dec = email.base64MIME.decode(encoded)
+ except binascii.Error:
+ # Turn this into a higher level exception. BAW: Right
+ # now we throw the lower level exception away but
+ # when/if we get exception chaining, we'll preserve it.
+ raise HeaderParseError
+ if dec is None:
dec = encoded
if decoded and decoded[-1][1] == charset:
@@ -126,7 +145,8 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None,
class Header:
- def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
+ def __init__(self, s=None, charset=None,
+ maxlinelen=None, header_name=None,
continuation_ws=' ', errors='strict'):
"""Create a MIME-compliant header that can contain many character sets.
@@ -253,13 +273,13 @@ class Header:
assert False, 'utf-8 conversion failed'
self._chunks.append((s, charset))
- def _split(self, s, charset, firstline=False):
+ def _split(self, s, charset, maxlinelen, splitchars):
# Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
- encoded = charset.from_splittable(splittable)
+ encoded = charset.from_splittable(splittable, True)
elen = charset.encoded_header_len(encoded)
-
- if elen <= self._maxlinelen:
+ # If the line's encoded length fits, just return it
+ if elen <= maxlinelen:
return [(encoded, charset)]
# If we have undetermined raw 8bit characters sitting in a byte
# string, we really don't know what the right thing to do is. We
@@ -267,7 +287,7 @@ class Header:
# could break if we split it between pairs. The least harm seems to
# be to not split the header at all, but that means they could go out
# longer than maxlinelen.
- elif charset == '8bit':
+ if charset == '8bit':
return [(s, charset)]
# BAW: I'm not sure what the right test here is. What we're trying to
# do is be faithful to RFC 2822's recommendation that ($2.2.3):
@@ -280,99 +300,30 @@ class Header:
# For now, I can only imagine doing this when the charset is us-ascii,
# although it's possible that other charsets may also benefit from the
# higher-level syntactic breaks.
- #
elif charset == 'us-ascii':
- return self._ascii_split(s, charset, firstline)
+ return self._split_ascii(s, charset, maxlinelen, splitchars)
# BAW: should we use encoded?
elif elen == len(s):
# We can split on _maxlinelen boundaries because we know that the
# encoding won't change the size of the string
- splitpnt = self._maxlinelen
+ splitpnt = maxlinelen
first = charset.from_splittable(splittable[:splitpnt], False)
last = charset.from_splittable(splittable[splitpnt:], False)
else:
- # Divide and conquer.
- halfway = _floordiv(len(splittable), 2)
- first = charset.from_splittable(splittable[:halfway], False)
- last = charset.from_splittable(splittable[halfway:], False)
- # Do the split
- return self._split(first, charset, firstline) + \
- self._split(last, charset)
-
- def _ascii_split(self, s, charset, firstline):
- # Attempt to split the line at the highest-level syntactic break
- # possible. Note that we don't have a lot of smarts about field
- # syntax; we just try to break on semi-colons, then whitespace.
- rtn = []
- lines = s.splitlines()
- while lines:
- line = lines.pop(0)
- if firstline:
- maxlinelen = self._firstlinelen
- firstline = False
- else:
- #line = line.lstrip()
- maxlinelen = self._maxlinelen
- # Short lines can remain unchanged
- if len(line.replace('\t', SPACE8)) <= maxlinelen:
- rtn.append(line)
- else:
- oldlen = len(line)
- # Try to break the line on semicolons, but if that doesn't
- # work, try to split on folding whitespace.
- while len(line) > maxlinelen:
- i = line.rfind(';', 0, maxlinelen)
- if i < 0:
- break
- rtn.append(line[:i] + ';')
- line = line[i+1:]
- # Is the remaining stuff still longer than maxlinelen?
- if len(line) <= maxlinelen:
- # Splitting on semis worked
- rtn.append(line)
- continue
- # Splitting on semis didn't finish the job. If it did any
- # work at all, stick the remaining junk on the front of the
- # `lines' sequence and let the next pass do its thing.
- if len(line) <> oldlen:
- lines.insert(0, line)
- continue
- # Otherwise, splitting on semis didn't help at all.
- parts = re.split(r'(\s+)', line)
- if len(parts) == 1 or (len(parts) == 3 and
- parts[0].endswith(':')):
- # This line can't be split on whitespace. There's now
- # little we can do to get this into maxlinelen. BAW:
- # We're still potentially breaking the RFC by possibly
- # allowing lines longer than the absolute maximum of 998
- # characters. For now, let it slide.
- #
- # len(parts) will be 1 if this line has no `Field: '
- # prefix, otherwise it will be len(3).
- rtn.append(line)
- continue
- # There is whitespace we can split on.
- first = parts.pop(0)
- sublines = [first]
- acc = len(first)
- while parts:
- len0 = len(parts[0])
- len1 = len(parts[1])
- if acc + len0 + len1 <= maxlinelen:
- sublines.append(parts.pop(0))
- sublines.append(parts.pop(0))
- acc += len0 + len1
- else:
- # Split it here, but don't forget to ignore the
- # next whitespace-only part
- if first <> '':
- rtn.append(EMPTYSTRING.join(sublines))
- del parts[0]
- first = parts.pop(0)
- sublines = [first]
- acc = len(first)
- rtn.append(EMPTYSTRING.join(sublines))
- return [(chunk, charset) for chunk in rtn]
+ # Binary search for split point
+ first, last = _binsplit(splittable, charset, maxlinelen)
+ # first is of the proper length so just wrap it in the appropriate
+ # chrome. last must be recursively split.
+ fsplittable = charset.to_splittable(first)
+ fencoded = charset.from_splittable(fsplittable, True)
+ chunk = [(fencoded, charset)]
+ return chunk + self._split(last, charset, self._maxlinelen, splitchars)
+
+ def _split_ascii(self, s, charset, firstlen, splitchars):
+ line = _split_ascii(s, firstlen, self._maxlinelen,
+ self._continuation_ws, splitchars)
+ lines = line.splitlines()
+ return zip(lines, [charset]*len(lines))
def _encode_chunks(self, newchunks):
# MIME-encode a header with many different charsets and/or encodings.
@@ -396,15 +347,14 @@ class Header:
chunks = []
for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
- # There's no encoding for this chunk's charsets
- _max_append(chunks, header, self._maxlinelen)
+ s = header
else:
- _max_append(chunks, charset.header_encode(header),
- self._maxlinelen, ' ')
+ s = charset.header_encode(header)
+ _max_append(chunks, s, self._maxlinelen, ' ')
joiner = NL + self._continuation_ws
return joiner.join(chunks)
- def encode(self):
+ def encode(self, splitchars=';, '):
"""Encode a message header into an RFC-compliant format.
There are many issues involved in converting a given string for use in
@@ -421,8 +371,112 @@ class Header:
If the given charset is not known or an error occurs during
conversion, this function will return the header untouched.
+
+ Optional splitchars is a string containing characters to split long
+ ASCII lines on, in rough support of RFC 2822's `highest level
+ syntactic breaks'. This doesn't affect RFC 2047 encoded lines.
"""
newchunks = []
+ maxlinelen = self._firstlinelen
+ lastlen = 0
for s, charset in self._chunks:
- newchunks += self._split(s, charset, True)
+ # The first bit of the next chunk should be just long enough to
+ # fill the next line. Don't forget the space separating the
+ # encoded words.
+ targetlen = maxlinelen - lastlen - 1
+ if targetlen < charset.encoded_header_len(''):
+ # Stick it on the next line
+ targetlen = maxlinelen
+ newchunks += self._split(s, charset, targetlen, splitchars)
+ lastchunk, lastcharset = newchunks[-1]
+ lastlen = lastcharset.encoded_header_len(lastchunk)
return self._encode_chunks(newchunks)
+
+
+
+def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
+ lines = []
+ maxlen = firstlen
+ for line in s.splitlines():
+ if len(line) < maxlen:
+ lines.append(line)
+ maxlen = restlen
+ continue
+ # Attempt to split the line at the highest-level syntactic break
+ # possible. Note that we don't have a lot of smarts about field
+ # syntax; we just try to break on semi-colons, then commas, then
+ # whitespace.
+ for ch in splitchars:
+ if line.find(ch) >= 0:
+ break
+ else:
+ # There's nothing useful to split the line on, not even spaces, so
+ # just append this line unchanged
+ lines.append(line)
+ maxlen = restlen
+ continue
+ # Now split the line on the character plus trailing whitespace
+ cre = re.compile(r'%s\s*' % ch)
+ if ch in ';,':
+ eol = ch
+ else:
+ eol = ''
+ joiner = eol + ' '
+ joinlen = len(joiner)
+ wslen = len(continuation_ws.replace('\t', SPACE8))
+ this = []
+ linelen = 0
+ for part in cre.split(line):
+ curlen = linelen + max(0, len(this)-1) * joinlen
+ partlen = len(part)
+ onfirstline = not lines
+ # We don't want to split after the field name, if we're on the
+ # first line and the field name is present in the header string.
+ if ch == ' ' and onfirstline and \
+ len(this) == 1 and fcre.match(this[0]):
+ this.append(part)
+ linelen += partlen
+ elif curlen + partlen > maxlen:
+ if this:
+ lines.append(joiner.join(this) + eol)
+ this = [part]
+ linelen = wslen + partlen
+ maxlen = restlen
+ else:
+ this.append(part)
+ linelen += partlen
+ # Put any left over parts on a line by themselves
+ if this:
+ lines.append(joiner.join(this))
+ linejoiner = '\n' + continuation_ws
+ return linejoiner.join(lines)
+
+
+
+def _binsplit(splittable, charset, maxlinelen):
+ i = 0
+ j = len(splittable)
+ while i < j:
+ # Invariants:
+ # 1. splittable[:k] fits for all k <= i (note that we *assume*,
+ # at the start, that splittable[:0] fits).
+ # 2. splittable[:k] does not fit for any k > j (at the start,
+ # this means we shouldn't look at any k > len(splittable)).
+ # 3. We don't know about splittable[:k] for k in i+1..j.
+ # 4. We want to set i to the largest k that fits, with i <= k <= j.
+ #
+ m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j
+ chunk = charset.from_splittable(splittable[:m], True)
+ chunklen = charset.encoded_header_len(chunk)
+ if chunklen <= maxlinelen:
+ # m is acceptable, so is a new lower bound.
+ i = m
+ else:
+ # m is not acceptable, so final i must be < m.
+ j = m - 1
+ # i == j. Invariant #1 implies that splittable[:i] fits, and
+ # invariant #2 implies that splittable[:i+1] does not fit, so i
+ # is what we're looking for.
+ first = charset.from_splittable(splittable[:i], False)
+ last = charset.from_splittable(splittable[i:], False)
+ return first, last