diff options
-rw-r--r-- | Lib/email/Header.py | 254 |
1 files changed, 154 insertions, 100 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py index 83c5843..abc342c 100644 --- a/Lib/email/Header.py +++ b/Lib/email/Header.py @@ -4,10 +4,12 @@ """Header encoding and decoding functionality.""" import re +import binascii from types import StringType, UnicodeType import email.quopriMIME import email.base64MIME +from email.Errors import HeaderParseError from email.Charset import Charset try: @@ -25,6 +27,7 @@ except NameError: CRLFSPACE = '\r\n ' CRLF = '\r\n' NL = '\n' +SPACE = ' ' SPACE8 = ' ' * 8 EMPTYSTRING = '' @@ -47,6 +50,13 @@ ecre = re.compile(r''' \?= # literal ?= ''', re.VERBOSE | re.IGNORECASE) +pcre = re.compile('([,;])') + +# Field name regexp, including trailing colon, but not separating whitespace, +# according to RFC 2822. Character range is from tilde to exclamation mark. +# For use with .match() +fcre = re.compile(r'[\041-\176]+:$') + # Helpers @@ -61,6 +71,9 @@ def decode_header(header): decoded parts of the header. Charset is None for non-encoded parts of the header, otherwise a lower-case string containing the name of the character set specified in the encoded string. + + An email.Errors.HeaderParseError may be raised when certain decoding error + occurs (e.g. a base64 decoding exception). """ # If no encoding, just return the header header = str(header) @@ -85,12 +98,18 @@ def decode_header(header): if parts: charset, encoding = [s.lower() for s in parts[0:2]] encoded = parts[2] - dec = '' + dec = None if encoding == 'q': dec = email.quopriMIME.header_decode(encoded) elif encoding == 'b': - dec = email.base64MIME.decode(encoded) - else: + try: + dec = email.base64MIME.decode(encoded) + except binascii.Error: + # Turn this into a higher level exception. BAW: Right + # now we throw the lower level exception away but + # when/if we get exception chaining, we'll preserve it. + raise HeaderParseError + if dec is None: dec = encoded if decoded and decoded[-1][1] == charset: @@ -126,7 +145,8 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None, class Header: - def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None, + def __init__(self, s=None, charset=None, + maxlinelen=None, header_name=None, continuation_ws=' ', errors='strict'): """Create a MIME-compliant header that can contain many character sets. @@ -253,13 +273,13 @@ class Header: assert False, 'utf-8 conversion failed' self._chunks.append((s, charset)) - def _split(self, s, charset, firstline=False): + def _split(self, s, charset, maxlinelen, splitchars): # Split up a header safely for use with encode_chunks. splittable = charset.to_splittable(s) - encoded = charset.from_splittable(splittable) + encoded = charset.from_splittable(splittable, True) elen = charset.encoded_header_len(encoded) - - if elen <= self._maxlinelen: + # If the line's encoded length first, just return it + if elen <= maxlinelen: return [(encoded, charset)] # If we have undetermined raw 8bit characters sitting in a byte # string, we really don't know what the right thing to do is. We @@ -267,7 +287,7 @@ class Header: # could break if we split it between pairs. The least harm seems to # be to not split the header at all, but that means they could go out # longer than maxlinelen. - elif charset == '8bit': + if charset == '8bit': return [(s, charset)] # BAW: I'm not sure what the right test here is. What we're trying to # do is be faithful to RFC 2822's recommendation that ($2.2.3): @@ -280,99 +300,30 @@ class Header: # For now, I can only imagine doing this when the charset is us-ascii, # although it's possible that other charsets may also benefit from the # higher-level syntactic breaks. - # elif charset == 'us-ascii': - return self._ascii_split(s, charset, firstline) + return self._split_ascii(s, charset, maxlinelen, splitchars) # BAW: should we use encoded? elif elen == len(s): # We can split on _maxlinelen boundaries because we know that the # encoding won't change the size of the string - splitpnt = self._maxlinelen + splitpnt = maxlinelen first = charset.from_splittable(splittable[:splitpnt], False) last = charset.from_splittable(splittable[splitpnt:], False) else: - # Divide and conquer. - halfway = _floordiv(len(splittable), 2) - first = charset.from_splittable(splittable[:halfway], False) - last = charset.from_splittable(splittable[halfway:], False) - # Do the split - return self._split(first, charset, firstline) + \ - self._split(last, charset) - - def _ascii_split(self, s, charset, firstline): - # Attempt to split the line at the highest-level syntactic break - # possible. Note that we don't have a lot of smarts about field - # syntax; we just try to break on semi-colons, then whitespace. - rtn = [] - lines = s.splitlines() - while lines: - line = lines.pop(0) - if firstline: - maxlinelen = self._firstlinelen - firstline = False - else: - #line = line.lstrip() - maxlinelen = self._maxlinelen - # Short lines can remain unchanged - if len(line.replace('\t', SPACE8)) <= maxlinelen: - rtn.append(line) - else: - oldlen = len(line) - # Try to break the line on semicolons, but if that doesn't - # work, try to split on folding whitespace. - while len(line) > maxlinelen: - i = line.rfind(';', 0, maxlinelen) - if i < 0: - break - rtn.append(line[:i] + ';') - line = line[i+1:] - # Is the remaining stuff still longer than maxlinelen? - if len(line) <= maxlinelen: - # Splitting on semis worked - rtn.append(line) - continue - # Splitting on semis didn't finish the job. If it did any - # work at all, stick the remaining junk on the front of the - # `lines' sequence and let the next pass do its thing. - if len(line) <> oldlen: - lines.insert(0, line) - continue - # Otherwise, splitting on semis didn't help at all. - parts = re.split(r'(\s+)', line) - if len(parts) == 1 or (len(parts) == 3 and - parts[0].endswith(':')): - # This line can't be split on whitespace. There's now - # little we can do to get this into maxlinelen. BAW: - # We're still potentially breaking the RFC by possibly - # allowing lines longer than the absolute maximum of 998 - # characters. For now, let it slide. - # - # len(parts) will be 1 if this line has no `Field: ' - # prefix, otherwise it will be len(3). - rtn.append(line) - continue - # There is whitespace we can split on. - first = parts.pop(0) - sublines = [first] - acc = len(first) - while parts: - len0 = len(parts[0]) - len1 = len(parts[1]) - if acc + len0 + len1 <= maxlinelen: - sublines.append(parts.pop(0)) - sublines.append(parts.pop(0)) - acc += len0 + len1 - else: - # Split it here, but don't forget to ignore the - # next whitespace-only part - if first <> '': - rtn.append(EMPTYSTRING.join(sublines)) - del parts[0] - first = parts.pop(0) - sublines = [first] - acc = len(first) - rtn.append(EMPTYSTRING.join(sublines)) - return [(chunk, charset) for chunk in rtn] + # Binary search for split point + first, last = _binsplit(splittable, charset, maxlinelen) + # first is of the proper length so just wrap it in the appropriate + # chrome. last must be recursively split. + fsplittable = charset.to_splittable(first) + fencoded = charset.from_splittable(fsplittable, True) + chunk = [(fencoded, charset)] + return chunk + self._split(last, charset, self._maxlinelen, splitchars) + + def _split_ascii(self, s, charset, firstlen, splitchars): + line = _split_ascii(s, firstlen, self._maxlinelen, + self._continuation_ws, splitchars) + lines = line.splitlines() + return zip(lines, [charset]*len(lines)) def _encode_chunks(self, newchunks): # MIME-encode a header with many different charsets and/or encodings. @@ -396,15 +347,14 @@ class Header: chunks = [] for header, charset in newchunks: if charset is None or charset.header_encoding is None: - # There's no encoding for this chunk's charsets - _max_append(chunks, header, self._maxlinelen) + s = header else: - _max_append(chunks, charset.header_encode(header), - self._maxlinelen, ' ') + s = charset.header_encode(header) + _max_append(chunks, s, self._maxlinelen, ' ') joiner = NL + self._continuation_ws return joiner.join(chunks) - def encode(self): + def encode(self, splitchars=';, '): """Encode a message header into an RFC-compliant format. There are many issues involved in converting a given string for use in @@ -421,8 +371,112 @@ class Header: If the given charset is not known or an error occurs during conversion, this function will return the header untouched. + + Optional splitchars is a string containing characters to split long + ASCII lines on, in rough support of RFC 2822's `highest level + syntactic breaks'. This doesn't affect RFC 2047 encoded lines. """ newchunks = [] + maxlinelen = self._firstlinelen + lastlen = 0 for s, charset in self._chunks: - newchunks += self._split(s, charset, True) + # The first bit of the next chunk should be just long enough to + # fill the next line. Don't forget the space separating the + # encoded words. + targetlen = maxlinelen - lastlen - 1 + if targetlen < charset.encoded_header_len(''): + # Stick it on the next line + targetlen = maxlinelen + newchunks += self._split(s, charset, targetlen, splitchars) + lastchunk, lastcharset = newchunks[-1] + lastlen = lastcharset.encoded_header_len(lastchunk) return self._encode_chunks(newchunks) + + + +def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars): + lines = [] + maxlen = firstlen + for line in s.splitlines(): + if len(line) < maxlen: + lines.append(line) + maxlen = restlen + continue + # Attempt to split the line at the highest-level syntactic break + # possible. Note that we don't have a lot of smarts about field + # syntax; we just try to break on semi-colons, then commas, then + # whitespace. + for ch in splitchars: + if line.find(ch) >= 0: + break + else: + # There's nothing useful to split the line on, not even spaces, so + # just append this line unchanged + lines.append(line) + maxlen = restlen + continue + # Now split the line on the character plus trailing whitespace + cre = re.compile(r'%s\s*' % ch) + if ch in ';,': + eol = ch + else: + eol = '' + joiner = eol + ' ' + joinlen = len(joiner) + wslen = len(continuation_ws.replace('\t', SPACE8)) + this = [] + linelen = 0 + for part in cre.split(line): + curlen = linelen + max(0, len(this)-1) * joinlen + partlen = len(part) + onfirstline = not lines + # We don't want to split after the field name, if we're on the + # first line and the field name is present in the header string. + if ch == ' ' and onfirstline and \ + len(this) == 1 and fcre.match(this[0]): + this.append(part) + linelen += partlen + elif curlen + partlen > maxlen: + if this: + lines.append(joiner.join(this) + eol) + this = [part] + linelen = wslen + partlen + maxlen = restlen + else: + this.append(part) + linelen += partlen + # Put any left over parts on a line by themselves + if this: + lines.append(joiner.join(this)) + linejoiner = '\n' + continuation_ws + return linejoiner.join(lines) + + + +def _binsplit(splittable, charset, maxlinelen): + i = 0 + j = len(splittable) + while i < j: + # Invariants: + # 1. splittable[:k] fits for all k <= i (note that we *assume*, + # at the start, that splittable[:0] fits). + # 2. splittable[:k] does not fit for any k > j (at the start, + # this means we shouldn't look at any k > len(splittable)). + # 3. We don't know about splittable[:k] for k in i+1..j. + # 4. We want to set i to the largest k that fits, with i <= k <= j. + # + m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j + chunk = charset.from_splittable(splittable[:m], True) + chunklen = charset.encoded_header_len(chunk) + if chunklen <= maxlinelen: + # m is acceptable, so is a new lower bound. + i = m + else: + # m is not acceptable, so final i must be < j. + j = m - 1 + # i == j. Invariant #1 implies that splittable[:i] fits, and + # invariant #2 implies that splittable[:i+1] does not fit, so i + # is what we're looking for. + first = charset.from_splittable(splittable[:i], False) + last = charset.from_splittable(splittable[i:], False) + return first, last |