diff options
author | R David Murray <rdmurray@bitdance.com> | 2012-06-02 21:56:49 (GMT) |
---|---|---|
committer | R David Murray <rdmurray@bitdance.com> | 2012-06-02 21:56:49 (GMT) |
commit | 07ea53cb218812404cdbde820647ce6e4b2d0f8e (patch) | |
tree | 153fbb31a5056379715475ed55a5c91a0fcbd8a9 /Lib/email | |
parent | e11eb0f21b8107d7cf61efd37ff3555258577d51 (diff) | |
download | cpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.zip cpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.tar.gz cpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.tar.bz2 |
#1079: Fix parsing of encoded words.
This is a behavior change: before this leading and trailing spaces were
stripped from ASCII parts, now they are preserved. Without this fix we didn't
parse the examples in the RFC correctly, so I think breaking backward
compatibility here is justified.
Patch by Ralf Schlatterbeck.
Diffstat (limited to 'Lib/email')
-rw-r--r-- | Lib/email/header.py | 45 |
1 files changed, 41 insertions, 4 deletions
diff --git a/Lib/email/header.py b/Lib/email/header.py index 3250d36..a89219d 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -40,7 +40,6 @@ ecre = re.compile(r''' \? # literal ? (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string \?= # literal ?= - (?=[ \t]|$) # whitespace or the end of the string ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE) # Field name regexp, including trailing colon, but not separating whitespace, @@ -86,8 +85,12 @@ def decode_header(header): words = [] for line in header.splitlines(): parts = ecre.split(line) + first = True while parts: - unencoded = parts.pop(0).strip() + unencoded = parts.pop(0) + if first: + unencoded = unencoded.lstrip() + first = False if unencoded: words.append((unencoded, None, None)) if parts: @@ -95,6 +98,16 @@ def decode_header(header): encoding = parts.pop(0).lower() encoded = parts.pop(0) words.append((encoded, encoding, charset)) + # Now loop over words and remove words that consist of whitespace + # between two encoded strings. + import sys + droplist = [] + for n, w in enumerate(words): + if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace(): + droplist.append(n-1) + for d in reversed(droplist): + del words[d] + # The next step is to decode each encoded word by applying the reverse # base64 or quopri transformation. decoded_words is now a list of the # form (decoded_word, charset). @@ -217,22 +230,27 @@ class Header: self._normalize() uchunks = [] lastcs = None + lastspace = None for string, charset in self._chunks: # We must preserve spaces between encoded and non-encoded word # boundaries, which means for us we need to add a space when we go # from a charset to None/us-ascii, or from None/us-ascii to a # charset. Only do this for the second and subsequent chunks. + # Don't add a space if the None/us-ascii string already has + # a space (trailing or leading depending on transition) nextcs = charset if nextcs == _charset.UNKNOWN8BIT: original_bytes = string.encode('ascii', 'surrogateescape') string = original_bytes.decode('ascii', 'replace') if uchunks: + hasspace = string and self._nonctext(string[0]) if lastcs not in (None, 'us-ascii'): - if nextcs in (None, 'us-ascii'): + if nextcs in (None, 'us-ascii') and not hasspace: uchunks.append(SPACE) nextcs = None - elif nextcs not in (None, 'us-ascii'): + elif nextcs not in (None, 'us-ascii') and not lastspace: uchunks.append(SPACE) + lastspace = string and self._nonctext(string[-1]) lastcs = nextcs uchunks.append(string) return EMPTYSTRING.join(uchunks) @@ -291,6 +309,11 @@ class Header: charset = UTF8 self._chunks.append((s, charset)) + def _nonctext(self, s): + """True if string s is not a ctext character of RFC822. + """ + return s.isspace() or s in ('(', ')', '\\') + def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'): r"""Encode a message header into an RFC-compliant format. @@ -334,7 +357,20 @@ class Header: maxlinelen = 1000000 formatter = _ValueFormatter(self._headerlen, maxlinelen, self._continuation_ws, splitchars) + lastcs = None + hasspace = lastspace = None for string, charset in self._chunks: + if hasspace is not None: + hasspace = string and self._nonctext(string[0]) + import sys + if lastcs not in (None, 'us-ascii'): + if not hasspace or charset not in (None, 'us-ascii'): + formatter.add_transition() + elif charset not in (None, 'us-ascii') and not lastspace: + formatter.add_transition() + lastspace = string and self._nonctext(string[-1]) + lastcs = charset + hasspace = False lines = string.splitlines() if lines: formatter.feed('', lines[0], charset) @@ -351,6 +387,7 @@ class Header: formatter.feed(fws, sline, charset) if len(lines) > 1: formatter.newline() + if self._chunks: formatter.add_transition() value = formatter._str(linesep) if _embeded_header.search(value): |