1 files changed, 154 insertions, 100 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index 83c5843..abc342c 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -4,10 +4,12 @@
 """Header encoding and decoding functionality."""
 
 import re
+import binascii
 from types import StringType, UnicodeType
 
 import email.quopriMIME
 import email.base64MIME
+from email.Errors import HeaderParseError
 from email.Charset import Charset
 
 try:
@@ -25,6 +27,7 @@ except NameError:
 CRLFSPACE = '\r\n '
 CRLF = '\r\n'
 NL = '\n'
+SPACE = ' '
 SPACE8 = ' ' * 8
 EMPTYSTRING = ''
 
@@ -47,6 +50,13 @@ ecre = re.compile(r'''
   \?=                   # literal ?=
   ''', re.VERBOSE | re.IGNORECASE)
 
+pcre = re.compile('([,;])')
+
+# Field name regexp, including trailing colon, but not separating whitespace,
+# according to RFC 2822.  Character range is from tilde to exclamation mark.
+# For use with .match()
+fcre = re.compile(r'[\041-\176]+:$')
+
 
 
 # Helpers
@@ -61,6 +71,9 @@ def decode_header(header):
     decoded parts of the header.  Charset is None for non-encoded parts of the
     header, otherwise a lower-case string containing the name of the character
     set specified in the encoded string.
+
+    An email.Errors.HeaderParseError may be raised when certain decoding error
+    occurs (e.g. a base64 decoding exception).
     """
     # If no encoding, just return the header
     header = str(header)
@@ -85,12 +98,18 @@ def decode_header(header):
             if parts:
                 charset, encoding = [s.lower() for s in parts[0:2]]
                 encoded = parts[2]
-                dec = ''
+                dec = None
                 if encoding == 'q':
                     dec = email.quopriMIME.header_decode(encoded)
                 elif encoding == 'b':
-                    dec = email.base64MIME.decode(encoded)
-                else:
+                    try:
+                        dec = email.base64MIME.decode(encoded)
+                    except binascii.Error:
+                        # Turn this into a higher level exception.  BAW: Right
+                        # now we throw the lower level exception away but
+                        # when/if we get exception chaining, we'll preserve it.
+                        raise HeaderParseError
+                if dec is None:
                     dec = encoded
 
                 if decoded and decoded[-1][1] == charset:
@@ -126,7 +145,8 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None,
 
 
 class Header:
-    def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
+    def __init__(self, s=None, charset=None,
+                 maxlinelen=None, header_name=None,
                  continuation_ws=' ', errors='strict'):
         """Create a MIME-compliant header that can contain many character sets.
 
@@ -253,13 +273,13 @@ class Header:
                     assert False, 'utf-8 conversion failed'
         self._chunks.append((s, charset))
 
-    def _split(self, s, charset, firstline=False):
+    def _split(self, s, charset, maxlinelen, splitchars):
         # Split up a header safely for use with encode_chunks.
         splittable = charset.to_splittable(s)
-        encoded = charset.from_splittable(splittable)
+        encoded = charset.from_splittable(splittable, True)
         elen = charset.encoded_header_len(encoded)
-
-        if elen <= self._maxlinelen:
+        # If the line's encoded length first, just return it
+        if elen <= maxlinelen:
             return [(encoded, charset)]
         # If we have undetermined raw 8bit characters sitting in a byte
         # string, we really don't know what the right thing to do is.  We
@@ -267,7 +287,7 @@ class Header:
         # could break if we split it between pairs.  The least harm seems to
         # be to not split the header at all, but that means they could go out
         # longer than maxlinelen.
-        elif charset == '8bit':
+        if charset == '8bit':
             return [(s, charset)]
         # BAW: I'm not sure what the right test here is.  What we're trying to
         # do is be faithful to RFC 2822's recommendation that ($2.2.3):
@@ -280,99 +300,30 @@ class Header:
         # For now, I can only imagine doing this when the charset is us-ascii,
         # although it's possible that other charsets may also benefit from the
         # higher-level syntactic breaks.
-        #
         elif charset == 'us-ascii':
-            return self._ascii_split(s, charset, firstline)
+            return self._split_ascii(s, charset, maxlinelen, splitchars)
         # BAW: should we use encoded?
         elif elen == len(s):
             # We can split on _maxlinelen boundaries because we know that the
             # encoding won't change the size of the string
-            splitpnt = self._maxlinelen
+            splitpnt = maxlinelen
             first = charset.from_splittable(splittable[:splitpnt], False)
             last = charset.from_splittable(splittable[splitpnt:], False)
         else:
-            # Divide and conquer.
-            halfway = _floordiv(len(splittable), 2)
-            first = charset.from_splittable(splittable[:halfway], False)
-            last = charset.from_splittable(splittable[halfway:], False)
-        # Do the split
-        return self._split(first, charset, firstline) + \
-               self._split(last, charset)
-
-    def _ascii_split(self, s, charset, firstline):
-        # Attempt to split the line at the highest-level syntactic break
-        # possible.  Note that we don't have a lot of smarts about field
-        # syntax; we just try to break on semi-colons, then whitespace.
-        rtn = []
-        lines = s.splitlines()
-        while lines:
-            line = lines.pop(0)
-            if firstline:
-                maxlinelen = self._firstlinelen
-                firstline = False
-            else:
-                #line = line.lstrip()
-                maxlinelen = self._maxlinelen
-            # Short lines can remain unchanged
-            if len(line.replace('\t', SPACE8)) <= maxlinelen:
-                rtn.append(line)
-            else:
-                oldlen = len(line)
-                # Try to break the line on semicolons, but if that doesn't
-                # work, try to split on folding whitespace.
-                while len(line) > maxlinelen:
-                    i = line.rfind(';', 0, maxlinelen)
-                    if i < 0:
-                        break
-                    rtn.append(line[:i] + ';')
-                    line = line[i+1:]
-                # Is the remaining stuff still longer than maxlinelen?
-                if len(line) <= maxlinelen:
-                    # Splitting on semis worked
-                    rtn.append(line)
-                    continue
-                # Splitting on semis didn't finish the job.  If it did any
-                # work at all, stick the remaining junk on the front of the
-                # `lines' sequence and let the next pass do its thing.
-                if len(line) <> oldlen:
-                    lines.insert(0, line)
-                    continue
-                # Otherwise, splitting on semis didn't help at all.
-                parts = re.split(r'(\s+)', line)
-                if len(parts) == 1 or (len(parts) == 3 and
-                                       parts[0].endswith(':')):
-                    # This line can't be split on whitespace.  There's now
-                    # little we can do to get this into maxlinelen.  BAW:
-                    # We're still potentially breaking the RFC by possibly
-                    # allowing lines longer than the absolute maximum of 998
-                    # characters.  For now, let it slide.
-                    #
-                    # len(parts) will be 1 if this line has no `Field: '
-                    # prefix, otherwise it will be len(3).
-                    rtn.append(line)
-                    continue
-                # There is whitespace we can split on.
-                first = parts.pop(0)
-                sublines = [first]
-                acc = len(first)
-                while parts:
-                    len0 = len(parts[0])
-                    len1 = len(parts[1])
-                    if acc + len0 + len1 <= maxlinelen:
-                        sublines.append(parts.pop(0))
-                        sublines.append(parts.pop(0))
-                        acc += len0 + len1
-                    else:
-                        # Split it here, but don't forget to ignore the
-                        # next whitespace-only part
-                        if first <> '':
-                            rtn.append(EMPTYSTRING.join(sublines))
-                        del parts[0]
-                        first = parts.pop(0)
-                        sublines = [first]
-                        acc = len(first)
-                rtn.append(EMPTYSTRING.join(sublines))
-        return [(chunk, charset) for chunk in rtn]
+            # Binary search for split point
+            first, last = _binsplit(splittable, charset, maxlinelen)
+        # first is of the proper length so just wrap it in the appropriate
+        # chrome.  last must be recursively split.
+        fsplittable = charset.to_splittable(first)
+        fencoded = charset.from_splittable(fsplittable, True)
+        chunk = [(fencoded, charset)]
+        return chunk + self._split(last, charset, self._maxlinelen, splitchars)
+
+    def _split_ascii(self, s, charset, firstlen, splitchars):
+        line = _split_ascii(s, firstlen, self._maxlinelen,
+                            self._continuation_ws, splitchars)
+        lines = line.splitlines()
+        return zip(lines, [charset]*len(lines))
 
     def _encode_chunks(self, newchunks):
         # MIME-encode a header with many different charsets and/or encodings.
@@ -396,15 +347,14 @@ class Header:
         chunks = []
         for header, charset in newchunks:
             if charset is None or charset.header_encoding is None:
-                # There's no encoding for this chunk's charsets
-                _max_append(chunks, header, self._maxlinelen)
+                s = header
             else:
-                _max_append(chunks, charset.header_encode(header),
-                            self._maxlinelen, ' ')
+                s = charset.header_encode(header)
+            _max_append(chunks, s, self._maxlinelen, ' ')
         joiner = NL + self._continuation_ws
         return joiner.join(chunks)
 
-    def encode(self):
+    def encode(self, splitchars=';, '):
         """Encode a message header into an RFC-compliant format.
 
         There are many issues involved in converting a given string for use in
@@ -421,8 +371,112 @@ class Header:
 
         If the given charset is not known or an error occurs during
         conversion, this function will return the header untouched.
+
+        Optional splitchars is a string containing characters to split long
+        ASCII lines on, in rough support of RFC 2822's `highest level
+        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
         """
         newchunks = []
+        maxlinelen = self._firstlinelen
+        lastlen = 0
         for s, charset in self._chunks:
-            newchunks += self._split(s, charset, True)
+            # The first bit of the next chunk should be just long enough to
+            # fill the next line.  Don't forget the space separating the
+            # encoded words.
+            targetlen = maxlinelen - lastlen - 1
+            if targetlen < charset.encoded_header_len(''):
+                # Stick it on the next line
+                targetlen = maxlinelen
+            newchunks += self._split(s, charset, targetlen, splitchars)
+            lastchunk, lastcharset = newchunks[-1]
+            lastlen = lastcharset.encoded_header_len(lastchunk)
         return self._encode_chunks(newchunks)
+
+
+
+def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
+    lines = []
+    maxlen = firstlen
+    for line in s.splitlines():
+        if len(line) < maxlen:
+            lines.append(line)
+            maxlen = restlen
+            continue
+        # Attempt to split the line at the highest-level syntactic break
+        # possible.  Note that we don't have a lot of smarts about field
+        # syntax; we just try to break on semi-colons, then commas, then
+        # whitespace.
+        for ch in splitchars:
+            if line.find(ch) >= 0:
+                break
+        else:
+            # There's nothing useful to split the line on, not even spaces, so
+            # just append this line unchanged
+            lines.append(line)
+            maxlen = restlen
+            continue
+        # Now split the line on the character plus trailing whitespace
+        cre = re.compile(r'%s\s*' % ch)
+        if ch in ';,':
+            eol = ch
+        else:
+            eol = ''
+        joiner = eol + ' '
+        joinlen = len(joiner)
+        wslen = len(continuation_ws.replace('\t', SPACE8))
+        this = []
+        linelen = 0
+        for part in cre.split(line):
+            curlen = linelen + max(0, len(this)-1) * joinlen
+            partlen = len(part)
+            onfirstline = not lines
+            # We don't want to split after the field name, if we're on the
+            # first line and the field name is present in the header string.
+            if ch == ' ' and onfirstline and \
+                   len(this) == 1 and fcre.match(this[0]):
+                this.append(part)
+                linelen += partlen
+            elif curlen + partlen > maxlen:
+                if this:
+                    lines.append(joiner.join(this) + eol)
+                this = [part]
+                linelen = wslen + partlen
+                maxlen = restlen
+            else:
+                this.append(part)
+                linelen += partlen
+        # Put any left over parts on a line by themselves
+        if this:
+            lines.append(joiner.join(this))
+    linejoiner = '\n' + continuation_ws
+    return linejoiner.join(lines)
+
+
+
+def _binsplit(splittable, charset, maxlinelen):
+    i = 0
+    j = len(splittable)
+    while i < j:
+        # Invariants:
+        # 1. splittable[:k] fits for all k <= i (note that we *assume*,
+        #    at the start, that splittable[:0] fits).
+        # 2. splittable[:k] does not fit for any k > j (at the start,
+        #    this means we shouldn't look at any k > len(splittable)).
+        # 3. We don't know about splittable[:k] for k in i+1..j.
+        # 4. We want to set i to the largest k that fits, with i <= k <= j.
+        #
+        m = (i+j+1) >> 1  # ceiling((i+j)/2); i < m <= j
+        chunk = charset.from_splittable(splittable[:m], True)
+        chunklen = charset.encoded_header_len(chunk)
+        if chunklen <= maxlinelen:
+            # m is acceptable, so is a new lower bound.
+            i = m
+        else:
+            # m is not acceptable, so final i must be < j.
+            j = m - 1
+    # i == j.  Invariant #1 implies that splittable[:i] fits, and
+    # invariant #2 implies that splittable[:i+1] does not fit, so i
+    # is what we're looking for.
+    first = charset.from_splittable(splittable[:i], False)
+    last  = charset.from_splittable(splittable[i:], False)
+    return first, last