diff options
Diffstat (limited to 'Lib/email/header.py')
-rw-r--r-- | Lib/email/header.py | 136 |
1 files changed, 96 insertions, 40 deletions
diff --git a/Lib/email/header.py b/Lib/email/header.py index 1d97f8f..e03e42d 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -25,10 +25,11 @@ BSPACE = b' ' SPACE8 = ' ' * 8 EMPTYSTRING = '' -MAXLINELEN = 76 +MAXLINELEN = 78 USASCII = Charset('us-ascii') UTF8 = Charset('utf-8') +TRANSITIONAL_SPACE = object() # Match encoded-word strings in the form =?charset?q?Hello_World?= ecre = re.compile(r''' @@ -109,7 +110,7 @@ def decode_header(header): last_word = last_charset = None for word, charset in decoded_words: if isinstance(word, str): - word = bytes(ord(c) for c in word) + word = bytes(word, 'raw-unicode-escape') if last_word is None: last_word = word last_charset = charset @@ -170,7 +171,8 @@ class Header: The maximum line length can be specified explicit via maxlinelen. For splitting the first line to a shorter value (to account for the field header which isn't included in s, e.g. `Subject') pass in the name of - the field in header_name. The default maxlinelen is 76. + the field in header_name. The default maxlinelen is 78 as recommended + by RFC 2822. continuation_ws must be RFC 2822 compliant folding whitespace (usually either a space or a hard tab) which will be prepended to continuation @@ -198,9 +200,10 @@ class Header: def __str__(self): """Return the string value of the header.""" + self._normalize() uchunks = [] lastcs = None - for s, charset in self._chunks: + for string, charset in self._chunks: # We must preserve spaces between encoded and non-encoded word # boundaries, which means for us we need to add a space when we go # from a charset to None/us-ascii, or from None/us-ascii to a @@ -214,15 +217,16 @@ class Header: elif nextcs not in (None, 'us-ascii'): uchunks.append(SPACE) lastcs = nextcs - uchunks.append(s) + uchunks.append(string) return EMPTYSTRING.join(uchunks) # Rich comparison operators for equality only. BAW: does it make sense to # have or explicitly disable <, <=, >, >= operators? def __eq__(self, other): # other may be a Header or a string. Both are fine so coerce - # ourselves to a string, swap the args and do another comparison. - return other == self.encode() + # ourselves to a unicode (of the unencoded header value), swap the + # args and do another comparison. + return other == str(self) def __ne__(self, other): return not self == other @@ -267,7 +271,7 @@ class Header: output_string = input_bytes.decode(output_charset, errors) self._chunks.append((output_string, charset)) - def encode(self, splitchars=';, \t'): + def encode(self, splitchars=';, \t', maxlinelen=None): """Encode a message header into an RFC-compliant format. There are many issues involved in converting a given string for use in @@ -290,7 +294,14 @@ class Header: syntactic breaks'. This doesn't affect RFC 2047 encoded lines. """ self._normalize() - formatter = _ValueFormatter(self._headerlen, self._maxlinelen, + if maxlinelen is None: + maxlinelen = self._maxlinelen + # A maxlinelen of 0 means don't wrap. For all practical purposes, + # choosing a huge number here accomplishes that and makes the + # _ValueFormatter algorithm much simpler. + if maxlinelen == 0: + maxlinelen = 1000000 + formatter = _ValueFormatter(self._headerlen, maxlinelen, self._continuation_ws, splitchars) for string, charset in self._chunks: lines = string.splitlines() @@ -301,9 +312,8 @@ class Header: return str(formatter) def _normalize(self): - # Normalize the chunks so that all runs of identical charsets get - # collapsed into a single unicode string. You need a space between - # encoded words, or between encoded and unencoded words. + # Step 1: Normalize the chunks so that all runs of identical charsets + # get collapsed into a single unicode string. chunks = [] last_charset = None last_chunk = [] @@ -313,8 +323,6 @@ class Header: else: if last_charset is not None: chunks.append((SPACE.join(last_chunk), last_charset)) - if last_charset != USASCII or charset != USASCII: - chunks.append((' ', USASCII)) last_chunk = [string] last_charset = charset if last_chunk: @@ -333,6 +341,10 @@ class _ValueFormatter: self._current_line = _Accumulator(headerlen) def __str__(self): + # Remove the trailing TRANSITIONAL_SPACE + last_line = self._current_line.pop() + if last_line is not TRANSITIONAL_SPACE: + self._current_line.push(last_line) self.newline() return NL.join(self._lines) @@ -348,24 +360,66 @@ class _ValueFormatter: if len(encoded_string) + len(self._current_line) <= self._maxlen: self._current_line.push(encoded_string) return - # Attempt to split the line at the highest-level syntactic break - # possible. Note that we don't have a lot of smarts about field + # If the charset has no header encoding (i.e. it is an ASCII encoding) + # then we must split the header at the "highest level syntactic break" + # possible. Note that we don't have a lot of smarts about field # syntax; we just try to break on semi-colons, then commas, then - # whitespace. Eventually, we'll allow this to be pluggable. - for ch in self._splitchars: - if ch in string: - break - else: - # We can't split the string to fit on the current line, so just - # put it on a line by itself. - self._lines.append(str(self._current_line)) - self._current_line.reset(self._continuation_ws) - self._current_line.push(encoded_string) + # whitespace. Eventually, this should be pluggable. + if charset.header_encoding is None: + for ch in self._splitchars: + if ch in string: + break + else: + ch = None + # If there's no available split character then regardless of + # whether the string fits on the line, we have to put it on a line + # by itself. + if ch is None: + if not self._current_line.is_onlyws(): + self._lines.append(str(self._current_line)) + self._current_line.reset(self._continuation_ws) + self._current_line.push(encoded_string) + else: + self._ascii_split(string, ch) return - self._spliterate(string, ch, charset) - - def _spliterate(self, string, ch, charset): - holding = _Accumulator(transformfunc=charset.header_encode) + # Otherwise, we're doing either a Base64 or a quoted-printable + # encoding which means we don't need to split the line on syntactic + # breaks. We can basically just find enough characters to fit on the + # current line, minus the RFC 2047 chrome. What makes this trickier + # though is that we have to split at octet boundaries, not character + # boundaries but it's only safe to split at character boundaries so at + # best we can only get close. + encoded_lines = charset.header_encode_lines(string, self._maxlengths()) + # The first element extends the current line, but if it's None then + # nothing more fit on the current line so start a new line. + try: + first_line = encoded_lines.pop(0) + except IndexError: + # There are no encoded lines, so we're done. + return + if first_line is not None: + self._current_line.push(first_line) + self._lines.append(str(self._current_line)) + self._current_line.reset(self._continuation_ws) + try: + last_line = encoded_lines.pop() + except IndexError: + # There was only one line. + return + self._current_line.push(last_line) + self._current_line.push(TRANSITIONAL_SPACE) + # Everything else are full lines in themselves. + for line in encoded_lines: + self._lines.append(self._continuation_ws + line) + + def _maxlengths(self): + # The first line's length. + yield self._maxlen - len(self._current_line) + while True: + yield self._maxlen - self._continuation_ws_len + + def _ascii_split(self, string, ch): + holding = _Accumulator() # Split the line on the split character, preserving it. If the split # character is whitespace RFC 2822 $2.2.3 requires us to fold on the # whitespace, so that the line leads with the original whitespace we @@ -387,8 +441,7 @@ class _ValueFormatter: # line, watch out for the current line containing only # whitespace. holding.pop() - if len(self._current_line) == 0 and ( - len(holding) == 0 or str(holding).isspace()): + if self._current_line.is_onlyws() and holding.is_onlyws(): # Don't start a new line. holding.push(part) part = None @@ -492,12 +545,8 @@ def _spliterator(character, string): class _Accumulator: - def __init__(self, initial_size=0, transformfunc=None): + def __init__(self, initial_size=0): self._initial_size = initial_size - if transformfunc is None: - self._transformfunc = lambda string: string - else: - self._transformfunc = transformfunc self._current = [] def push(self, string): @@ -507,14 +556,21 @@ class _Accumulator: return self._current.pop() def __len__(self): - return len(str(self)) + self._initial_size + return sum((len(string) + for string in self._current + if string is not TRANSITIONAL_SPACE), + self._initial_size) def __str__(self): - return self._transformfunc(EMPTYSTRING.join(self._current)) + return EMPTYSTRING.join( + (' ' if string is TRANSITIONAL_SPACE else string) + for string in self._current) def reset(self, string=None): self._current = [] - self._current_len = 0 self._initial_size = 0 if string is not None: self.push(string) + + def is_onlyws(self): + return len(self) == 0 or str(self).isspace() |