summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Barry Warsaw <barry@python.org>, 2002-06-28 23:46:53 (GMT)
committer: Barry Warsaw <barry@python.org>, 2002-06-28 23:46:53 (GMT)
commit: 766125080f12f1ff0db7b1cf5b963e9a60324ab3 (patch)
tree: 3471083e0fa8ca356b87cc619cef24bc21717110
parent: 062749ac577be62c2f14fe61c2167c9c5b909ea3 (diff)
downloadcpython-766125080f12f1ff0db7b1cf5b963e9a60324ab3.zip
cpython-766125080f12f1ff0db7b1cf5b963e9a60324ab3.tar.gz
cpython-766125080f12f1ff0db7b1cf5b963e9a60324ab3.tar.bz2
Teach this class about "highest-level syntactic breaks" but only for
headers with no charset or 'us-ascii' charsets. Actually this is only
partially true: we know about semicolons (but not true parameters) and
we know about whitespace (but not technically folding whitespace).
Still it should be good enough for all practical purposes.

Other changes include:

__init__(): Add a continuation_ws argument, which defaults to a single
space. Set this to change the whitespace used for continuation lines
when a header must be split. Also, changed the way header line lengths
are calculated, so that they take into account continuation_ws (when
tabs-expanded) and any provided header_name parameter. This should do
much better on returning split headers for which the first and
subsequent lines must fit into a specified width.

guess_maxlinelen(): Removed. I don't think we need this method as part
of the public API.

encode_chunks() -> _encode_chunks(): I don't think we need this one as
part of the public API either.
-rw-r--r-- Lib/email/Header.py 209
1 files changed, 151 insertions, 58 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index 714839e..c72f64d 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -16,7 +16,9 @@ except SyntaxError:
CRLFSPACE = '\r\n '
CRLF = '\r\n'
-NLSPACE = '\n '
+NL = '\n'
+SPACE8 = ' ' * 8
+EMPTYSTRING = ''
MAXLINELEN = 76
@@ -92,11 +94,12 @@ def decode_header(header):
class Header:
- def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
+ def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
+ continuation_ws=' '):
"""Create a MIME-compliant header that can contain many languages.
Specify the initial header value in s. Specify its character set as a
- Charset object in the charset argument. If none, a default Charset
+ Charset object in the charset argument. If None, a default Charset
instance will be used.
You can later append to the header with append(s, charset) below;
@@ -104,43 +107,41 @@ class Header:
here. In fact, it's optional, and if not given, defaults to the
charset specified in the constructor.
- The maximum line length can be specified explicitly via maxlinelen.
- You can also pass None for maxlinelen and the name of a header field
- (e.g. "Subject") to let the constructor guess the best line length to
- use. The default maxlinelen is 76.
+ The maximum line length can be specified explicitly via maxlinelen. For
+ splitting the first line to a shorter value (to account for the field
+ header which isn't included in s, e.g. `Subject') pass in the name of
+ the field in header_name. The default maxlinelen is 76.
+
+ continuation_ws must be RFC 2822 compliant folding whitespace (usually
+ either a space or a hard tab) which will be prepended to continuation
+ lines.
"""
if charset is None:
charset = Charset()
self._charset = charset
+ self._continuation_ws = continuation_ws
+ cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
# BAW: I believe `chunks' and `maxlinelen' should be non-public.
self._chunks = []
self.append(s, charset)
if maxlinelen is None:
- if header_name is None:
- self._maxlinelen = MAXLINELEN
- else:
- self.guess_maxlinelen(header_name)
+ maxlinelen = MAXLINELEN
+ if header_name is None:
+ # We don't know anything about the field header so the first line
+ # is the same length as subsequent lines.
+ self._firstlinelen = maxlinelen
else:
- self._maxlinelen = maxlinelen
+ # The first line should be shorter to take into account the field
+ # header. Also subtract off 2 extra for the colon and space.
+ self._firstlinelen = maxlinelen - len(header_name) - 2
+ # Second and subsequent lines should subtract off the length in
+ # columns of the continuation whitespace prefix.
+ self._maxlinelen = maxlinelen - cws_expanded_len
def __str__(self):
"""A synonym for self.encode()."""
return self.encode()
- def guess_maxlinelen(self, s=None):
- """Guess the maximum length to make each header line.
-
- Given a header name (e.g. "Subject"), set this header's maximum line
- length to an appropriate length to avoid line wrapping. If s is not
- given, return the previous maximum line length and don't set it.
-
- Returns the new maximum line length.
- """
- # BAW: is this semantic necessary?
- if s is not None:
- self._maxlinelen = MAXLINELEN - len(s) - 2
- return self._maxlinelen
-
def append(self, s, charset=None):
"""Append string s with Charset charset to the MIME header.
@@ -150,7 +151,7 @@ class Header:
charset = self._charset
self._chunks.append((s, charset))
- def _split(self, s, charset):
+ def _split(self, s, charset, firstline=0):
# Split up a header safely for use with encode_chunks. BAW: this
# appears to be a private convenience method.
splittable = charset.to_splittable(s)
@@ -159,6 +160,20 @@ class Header:
if elen <= self._maxlinelen:
return [(encoded, charset)]
+ # BAW: I'm not sure what the right test here is. What we're trying to
+ # do is be faithful to RFC 2822's recommendation that ($2.2.3):
+ #
+ # "Note: Though structured field bodies are defined in such a way that
+ # folding can take place between many of the lexical tokens (and even
+ # within some of the lexical tokens), folding SHOULD be limited to
+ # placing the CRLF at higher-level syntactic breaks."
+ #
+ # For now, I can only imagine doing this when the charset is us-ascii,
+ # although it's possible that other charsets may also benefit from the
+ # higher-level syntactic breaks.
+ #
+ elif charset == 'us-ascii':
+ return self._ascii_split(s, charset, firstline)
# BAW: should we use encoded?
elif elen == len(s):
# We can split on _maxlinelen boundaries because we know that the
@@ -166,39 +181,91 @@ class Header:
splitpnt = self._maxlinelen
first = charset.from_splittable(splittable[:splitpnt], 0)
last = charset.from_splittable(splittable[splitpnt:], 0)
- return self._split(first, charset) + self._split(last, charset)
else:
# Divide and conquer.
halfway = _floordiv(len(splittable), 2)
first = charset.from_splittable(splittable[:halfway], 0)
last = charset.from_splittable(splittable[halfway:], 0)
- return self._split(first, charset) + self._split(last, charset)
-
- def encode(self):
- """Encode a message header, possibly converting charset and encoding.
-
- There are many issues involved in converting a given string for use in
- an email header. Only certain character sets are readable in most
- email clients, and as header strings can only contain a subset of
- 7-bit ASCII, care must be taken to properly convert and encode (with
- Base64 or quoted-printable) header strings. In addition, there is a
- 75-character length limit on any given encoded header field, so
- line-wrapping must be performed, even with double-byte character sets.
-
- This method will do its best to convert the string to the correct
- character set used in email, and encode and line wrap it safely with
- the appropriate scheme for that character set.
-
- If the given charset is not known or an error occurs during
- conversion, this function will return the header untouched.
- """
- newchunks = []
- for s, charset in self._chunks:
- newchunks += self._split(s, charset)
- self._chunks = newchunks
- return self.encode_chunks()
-
- def encode_chunks(self):
+ # Do the split
+ return self._split(first, charset, firstline) + \
+ self._split(last, charset)
+
+ def _ascii_split(self, s, charset, firstline):
+ # Attempt to split the line at the highest-level syntactic break
+ # possible. Note that we don't have a lot of smarts about field
+ # syntax; we just try to break on semi-colons, then whitespace.
+ rtn = []
+ lines = s.splitlines()
+ while lines:
+ line = lines.pop(0)
+ if firstline:
+ maxlinelen = self._firstlinelen
+ firstline = 0
+ else:
+ line = line.lstrip()
+ maxlinelen = self._maxlinelen
+ # Short lines can remain unchanged
+ if len(line.replace('\t', SPACE8)) <= maxlinelen:
+ rtn.append(line)
+ else:
+ oldlen = len(line)
+ # Try to break the line on semicolons, but if that doesn't
+ # work, try to split on folding whitespace.
+ while len(line) > maxlinelen:
+ i = line.rfind(';', 0, maxlinelen)
+ if i < 0:
+ break
+ rtn.append(line[:i] + ';')
+ line = line[i+1:]
+ # Is the remaining stuff still longer than maxlinelen?
+ if len(line) <= maxlinelen:
+ # Splitting on semis worked
+ rtn.append(line)
+ continue
+ # Splitting on semis didn't finish the job. If it did any
+ # work at all, stick the remaining junk on the front of the
+ # `lines' sequence and let the next pass do its thing.
+ if len(line) <> oldlen:
+ lines.insert(0, line)
+ continue
+ # Otherwise, splitting on semis didn't help at all.
+ parts = re.split(r'(\s+)', line)
+ if len(parts) == 1 or (len(parts) == 3 and
+ parts[0].endswith(':')):
+ # This line can't be split on whitespace. There's now
+ # little we can do to get this into maxlinelen. BAW:
+ # We're still potentially breaking the RFC by possibly
+ # allowing lines longer than the absolute maximum of 998
+ # characters. For now, let it slide.
+ #
+ # len(parts) will be 1 if this line has no `Field: '
+ # prefix, otherwise it will be 3.
+ rtn.append(line)
+ continue
+ # There is whitespace we can split on.
+ first = parts.pop(0)
+ sublines = [first]
+ acc = len(first)
+ while parts:
+ len0 = len(parts[0])
+ len1 = len(parts[1])
+ if acc + len0 + len1 <= maxlinelen:
+ sublines.append(parts.pop(0))
+ sublines.append(parts.pop(0))
+ acc += len0 + len1
+ else:
+ # Split it here, but don't forget to ignore the
+ # next whitespace-only part
+ if first <> '':
+ rtn.append(EMPTYSTRING.join(sublines))
+ del parts[0]
+ first = parts.pop(0)
+ sublines = [first]
+ acc = len(first)
+ rtn.append(EMPTYSTRING.join(sublines))
+ return [(chunk, charset) for chunk in rtn]
+
+ def _encode_chunks(self):
"""MIME-encode a header with many different charsets and/or encodings.
Given a list of pairs (string, charset), return a MIME-encoded string
@@ -219,9 +286,35 @@ class Header:
"""
chunks = []
for header, charset in self._chunks:
- if charset is None:
- _max_append(chunks, header, self._maxlinelen, ' ')
+ if charset is None or charset.header_encoding is None:
+ # There's no encoding for this chunk's charsets
+ _max_append(chunks, header, self._maxlinelen)
else:
_max_append(chunks, charset.header_encode(header, 0),
self._maxlinelen, ' ')
- return NLSPACE.join(chunks)
+ joiner = NL + self._continuation_ws
+ return joiner.join(chunks)
+
+ def encode(self):
+ """Encode a message header, possibly converting charset and encoding.
+
+ There are many issues involved in converting a given string for use in
+ an email header. Only certain character sets are readable in most
+ email clients, and as header strings can only contain a subset of
+ 7-bit ASCII, care must be taken to properly convert and encode (with
+ Base64 or quoted-printable) header strings. In addition, there is a
+ 75-character length limit on any given encoded header field, so
+ line-wrapping must be performed, even with double-byte character sets.
+
+ This method will do its best to convert the string to the correct
+ character set used in email, and encode and line wrap it safely with
+ the appropriate scheme for that character set.
+
+ If the given charset is not known or an error occurs during
+ conversion, this function will return the header untouched.
+ """
+ newchunks = []
+ for s, charset in self._chunks:
+ newchunks += self._split(s, charset, 1)
+ self._chunks = newchunks
+ return self._encode_chunks()