summaryrefslogtreecommitdiffstats
path: root/Lib/email
diff options
context:
space:
mode:
authorR David Murray <rdmurray@bitdance.com>2012-06-02 21:56:49 (GMT)
committerR David Murray <rdmurray@bitdance.com>2012-06-02 21:56:49 (GMT)
commit07ea53cb218812404cdbde820647ce6e4b2d0f8e (patch)
tree153fbb31a5056379715475ed55a5c91a0fcbd8a9 /Lib/email
parente11eb0f21b8107d7cf61efd37ff3555258577d51 (diff)
downloadcpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.zip
cpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.tar.gz
cpython-07ea53cb218812404cdbde820647ce6e4b2d0f8e.tar.bz2
#1079: Fix parsing of encoded words.
This is a behavior change: before this leading and trailing spaces were stripped from ASCII parts, now they are preserved. Without this fix we didn't parse the examples in the RFC correctly, so I think breaking backward compatibility here is justified. Patch by Ralf Schlatterbeck.
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/header.py45
1 files changed, 41 insertions, 4 deletions
diff --git a/Lib/email/header.py b/Lib/email/header.py
index 3250d36..a89219d 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -40,7 +40,6 @@ ecre = re.compile(r'''
\? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?=
- (?=[ \t]|$) # whitespace or the end of the string
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
# Field name regexp, including trailing colon, but not separating whitespace,
@@ -86,8 +85,12 @@ def decode_header(header):
words = []
for line in header.splitlines():
parts = ecre.split(line)
+ first = True
while parts:
- unencoded = parts.pop(0).strip()
+ unencoded = parts.pop(0)
+ if first:
+ unencoded = unencoded.lstrip()
+ first = False
if unencoded:
words.append((unencoded, None, None))
if parts:
@@ -95,6 +98,16 @@ def decode_header(header):
encoding = parts.pop(0).lower()
encoded = parts.pop(0)
words.append((encoded, encoding, charset))
+ # Now loop over words and remove words that consist of whitespace
+ # between two encoded strings.
+ import sys
+ droplist = []
+ for n, w in enumerate(words):
+ if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
+ droplist.append(n-1)
+ for d in reversed(droplist):
+ del words[d]
+
# The next step is to decode each encoded word by applying the reverse
# base64 or quopri transformation. decoded_words is now a list of the
# form (decoded_word, charset).
@@ -217,22 +230,27 @@ class Header:
self._normalize()
uchunks = []
lastcs = None
+ lastspace = None
for string, charset in self._chunks:
# We must preserve spaces between encoded and non-encoded word
# boundaries, which means for us we need to add a space when we go
# from a charset to None/us-ascii, or from None/us-ascii to a
# charset. Only do this for the second and subsequent chunks.
+ # Don't add a space if the None/us-ascii string already has
+ # a space (trailing or leading depending on transition)
nextcs = charset
if nextcs == _charset.UNKNOWN8BIT:
original_bytes = string.encode('ascii', 'surrogateescape')
string = original_bytes.decode('ascii', 'replace')
if uchunks:
+ hasspace = string and self._nonctext(string[0])
if lastcs not in (None, 'us-ascii'):
- if nextcs in (None, 'us-ascii'):
+ if nextcs in (None, 'us-ascii') and not hasspace:
uchunks.append(SPACE)
nextcs = None
- elif nextcs not in (None, 'us-ascii'):
+ elif nextcs not in (None, 'us-ascii') and not lastspace:
uchunks.append(SPACE)
+ lastspace = string and self._nonctext(string[-1])
lastcs = nextcs
uchunks.append(string)
return EMPTYSTRING.join(uchunks)
@@ -291,6 +309,11 @@ class Header:
charset = UTF8
self._chunks.append((s, charset))
+ def _nonctext(self, s):
+ """True if string s is not a ctext character of RFC822.
+ """
+ return s.isspace() or s in ('(', ')', '\\')
+
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
r"""Encode a message header into an RFC-compliant format.
@@ -334,7 +357,20 @@ class Header:
maxlinelen = 1000000
formatter = _ValueFormatter(self._headerlen, maxlinelen,
self._continuation_ws, splitchars)
+ lastcs = None
+ hasspace = lastspace = None
for string, charset in self._chunks:
+ if hasspace is not None:
+ hasspace = string and self._nonctext(string[0])
+ import sys
+ if lastcs not in (None, 'us-ascii'):
+ if not hasspace or charset not in (None, 'us-ascii'):
+ formatter.add_transition()
+ elif charset not in (None, 'us-ascii') and not lastspace:
+ formatter.add_transition()
+ lastspace = string and self._nonctext(string[-1])
+ lastcs = charset
+ hasspace = False
lines = string.splitlines()
if lines:
formatter.feed('', lines[0], charset)
@@ -351,6 +387,7 @@ class Header:
formatter.feed(fws, sline, charset)
if len(lines) > 1:
formatter.newline()
+ if self._chunks:
formatter.add_transition()
value = formatter._str(linesep)
if _embeded_header.search(value):