summary refs log tree commit diff stats
diff options
context:
space:
mode:
author R. David Murray <rdmurray@bitdance.com> 2011-01-07 23:25:30 (GMT)
committer R. David Murray <rdmurray@bitdance.com> 2011-01-07 23:25:30 (GMT)
commit 9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c (patch)
tree 30d925a75c0b3bd542c00d6dbd667e72178056a7
parent 6f0022d84af15d51ffa1606991f2b6e9e56448ed (diff)
download cpython-9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c.zip
cpython-9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c.tar.gz
cpython-9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c.tar.bz2
#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s.
This applies only when generating strings from non-RFC-compliant binary input; it makes the existing recoding behavior more consistent (i.e., no data is lost any more when recoding).
-rw-r--r-- Doc/library/email.generator.rst 4
-rw-r--r-- Doc/library/email.header.rst 10
-rw-r--r-- Doc/library/email.message.rst 7
-rw-r--r-- Doc/whatsnew/3.2.rst 2
-rw-r--r-- Lib/email/charset.py 22
-rw-r--r-- Lib/email/header.py 9
-rw-r--r-- Lib/email/message.py 25
-rw-r--r-- Lib/email/test/test_email.py 88
-rw-r--r-- Misc/NEWS 4
9 files changed, 109 insertions, 62 deletions
diff --git a/Doc/library/email.generator.rst b/Doc/library/email.generator.rst
index 22d8b09..85b32fe 100644
--- a/Doc/library/email.generator.rst
+++ b/Doc/library/email.generator.rst
@@ -79,8 +79,8 @@ Here are the public methods of the :class:`Generator` class, imported from the
Messages parsed with a Bytes parser that have a
:mailheader:`Content-Transfer-Encoding` of 8bit will be converted to
- use a 7bit Content-Transfer-Encoding. Any other non-ASCII bytes in the
- message structure will be converted to '?' characters.
+ use a 7bit Content-Transfer-Encoding. Non-ASCII bytes in the headers
+ will be :rfc:`2047` encoded with a charset of ``unknown-8bit``.
.. versionchanged:: 3.2
Added support for re-encoding 8bit message bodies, and the *linesep*
diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst
index ff2b484..29752c4 100644
--- a/Doc/library/email.header.rst
+++ b/Doc/library/email.header.rst
@@ -130,8 +130,14 @@ Here is the :class:`Header` class description:
.. method:: __str__()
- A helper for :class:`str`'s :func:`encode` method. Returns the header as
- a Unicode string.
+ Returns an approximation of the :class:`Header` as a string, using an
+ unlimited line length. All pieces are converted to Unicode using the
+ specified encoding and joined together appropriately. Any pieces with a
+ charset of ``unknown-8bit`` are decoded as ASCII using the ``replace``
+ error handler.
+
+ .. versionchanged:: 3.2
+ Added handling for the ``unknown-8bit`` charset.
.. method:: __eq__(other)
diff --git a/Doc/library/email.message.rst b/Doc/library/email.message.rst
index e76e689..29f7ba3 100644
--- a/Doc/library/email.message.rst
+++ b/Doc/library/email.message.rst
@@ -169,9 +169,10 @@ Here are the methods of the :class:`Message` class:
Note that in all cases, any envelope header present in the message is not
included in the mapping interface.
- In a model generated from bytes, any header values that (in contravention
- of the RFCs) contain non-ASCII bytes will have those bytes transformed
- into '?' characters when the values are retrieved through this interface.
+ In a model generated from bytes, any header values that (in contravention of
+ the RFCs) contain non-ASCII bytes will, when retrieved through this
+ interface, be represented as :class:`~email.header.Header` objects with
+ a charset of ``unknown-8bit``.
.. method:: __len__()
diff --git a/Doc/whatsnew/3.2.rst b/Doc/whatsnew/3.2.rst
index b6e2550..69b318e 100644
--- a/Doc/whatsnew/3.2.rst
+++ b/Doc/whatsnew/3.2.rst
@@ -618,6 +618,8 @@ format.
* Given bytes input to the model, :class:`~email.generator.Generator` will
convert message bodies that have a :mailheader:`Content-Transfer-Encoding` of
*8bit* to instead have a *7bit* :mailheader:`Content-Transfer-Encoding`.
+ XXX: Headers with unencoded non-ASCII bytes will be :rfc:`2047`\ -encoded
+ using the charset ``unknown-8bit``.
* A new class :class:`~email.generator.BytesGenerator` produces bytes as output,
preserving any unchanged non-ASCII data that was present in the input used to
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (i.e. have surrogate-escaped bytes)
+def _encode(string, codec):
+ if codec == UNKNOWN8BIT:
+ return string.encode('ascii', 'surrogateescape')
+ else:
+ return string.encode(codec)
+
+
+
class Charset:
"""Map character sets to their email properties.
@@ -282,8 +293,7 @@ class Charset:
:return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
- charset = self.get_output_charset()
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
encoder_module = self._get_encoder(header_bytes)
if encoder_module is None:
@@ -309,7 +319,7 @@ class Charset:
"""
# See which encoding we should use.
codec = self.output_codec or 'us-ascii'
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
encoder_module = self._get_encoder(header_bytes)
encoder = partial(encoder_module.header_encode, charset=str(self))
# Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@ class Charset:
for character in string:
current_line.append(character)
this_line = EMPTYSTRING.join(current_line)
- length = encoder_module.header_length(this_line.encode(charset))
+ length = encoder_module.header_length(_encode(this_line, charset))
if length > maxlen:
# This last character doesn't fit so pop it off.
current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
else:
separator = (' ' if lines else '')
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
current_line = [character]
maxlen = next(maxlengths) - extra
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
return lines
diff --git a/Lib/email/header.py b/Lib/email/header.py
index d462bf0..f90883f 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -17,7 +17,8 @@ import email.quoprimime
import email.base64mime
from email.errors import HeaderParseError
-from email.charset import Charset
+from email import charset as _charset
+Charset = _charset.Charset
NL = '\n'
SPACE = ' '
@@ -210,6 +211,9 @@ class Header:
# from a charset to None/us-ascii, or from None/us-ascii to a
# charset. Only do this for the second and subsequent chunks.
nextcs = charset
+ if nextcs == _charset.UNKNOWN8BIT:
+ original_bytes = string.encode('ascii', 'surrogateescape')
+ string = original_bytes.decode('ascii', 'replace')
if uchunks:
if lastcs not in (None, 'us-ascii'):
if nextcs in (None, 'us-ascii'):
@@ -263,7 +267,8 @@ class Header:
# Ensure that the bytes we're storing can be decoded to the output
# character set, otherwise an early error is thrown.
output_charset = charset.output_codec or 'us-ascii'
- s.encode(output_charset, errors)
+ if output_charset != _charset.UNKNOWN8BIT:
+ s.encode(output_charset, errors)
self._chunks.append((s, charset))
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
diff --git a/Lib/email/message.py b/Lib/email/message.py
index d2483ca..b821bfd 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -16,7 +16,9 @@ from io import BytesIO, StringIO
# Intrapackage imports
from email import utils
from email import errors
-from email.charset import Charset
+from email import header
+from email import charset as _charset
+Charset = _charset.Charset
SEMISPACE = '; '
@@ -31,16 +33,15 @@ _has_surrogates = re.compile(
# Helper functions
-def _sanitize_surrogates(value):
- # If the value contains surrogates, re-decode and replace the original
- # non-ascii bytes with '?'s. Used to sanitize header values before letting
- # them escape as strings.
+def _sanitize_header(name, value):
+ # If the header value contains surrogates, return a Header using
+ # the unknown-8bit charset to encode the bytes as encoded words.
if not isinstance(value, str):
- # Header object
+ # Assume it is already a header object
return value
if _has_surrogates(value):
- original_bytes = value.encode('ascii', 'surrogateescape')
- return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?')
+ return header.Header(value, charset=_charset.UNKNOWN8BIT,
+ header_name=name)
else:
return value
@@ -398,7 +399,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return [_sanitize_surrogates(v) for k, v in self._headers]
+ return [_sanitize_header(k, v) for k, v in self._headers]
def items(self):
"""Get all the message's header fields and values.
@@ -408,7 +409,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return [(k, _sanitize_surrogates(v)) for k, v in self._headers]
+ return [(k, _sanitize_header(k, v)) for k, v in self._headers]
def get(self, name, failobj=None):
"""Get a header value.
@@ -419,7 +420,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- return _sanitize_surrogates(v)
+ return _sanitize_header(k, v)
return failobj
#
@@ -439,7 +440,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- values.append(_sanitize_surrogates(v))
+ values.append(_sanitize_header(k, v))
if not values:
return failobj
return values
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 4855371..a1798ce 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -2841,7 +2841,7 @@ class Test8BitBytesHandling(unittest.TestCase):
cte='8bit',
bodyline='pöstal').encode('utf-8')
msg = email.message_from_bytes(m)
- self.assertEqual(msg.get_payload(), "p��stal\n")
+ self.assertEqual(msg.get_payload(), "p\uFFFD\uFFFDstal\n")
self.assertEqual(msg.get_payload(decode=True),
"pöstal\n".encode('utf-8'))
@@ -2874,7 +2874,7 @@ class Test8BitBytesHandling(unittest.TestCase):
cte='quoted-printable',
bodyline='p=C3=B6stál').encode('utf-8')
msg = email.message_from_bytes(m)
- self.assertEqual(msg.get_payload(), 'p=C3=B6st��l\n')
+ self.assertEqual(msg.get_payload(), 'p=C3=B6st\uFFFD\uFFFDl\n')
self.assertEqual(msg.get_payload(decode=True),
'pöstál\n'.encode('utf-8'))
@@ -2899,52 +2899,65 @@ class Test8BitBytesHandling(unittest.TestCase):
'<,.V<W1A; á \n'.encode('utf-8'))
- headertest_msg = textwrap.dedent("""\
- From: foo@bar.com
- To: báz
- Subject: Maintenant je vous présente mon collègue, le pouf célèbre
- \tJean de Baddie
- From: göst
-
- Yes, they are flying.
- """).encode('utf-8')
+ headertest_headers = (
+ ('From: foo@bar.com', ('From', 'foo@bar.com')),
+ ('To: báz', ('To', '=?unknown-8bit?q?b=C3=A1z?=')),
+ ('Subject: Maintenant je vous présente mon collègue, le pouf célèbre\n'
+ '\tJean de Baddie',
+ ('Subject', '=?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_'
+ 'coll=C3=A8gue=2C_le_pouf_c=C3=A9l=C3=A8bre?=\n'
+ ' =?unknown-8bit?q?_Jean_de_Baddie?=')),
+ ('From: göst', ('From', '=?unknown-8bit?b?Z8O2c3Q=?=')),
+ )
+ headertest_msg = ('\n'.join([src for (src, _) in headertest_headers]) +
+ '\nYes, they are flying.\n').encode('utf-8')
def test_get_8bit_header(self):
msg = email.message_from_bytes(self.headertest_msg)
- self.assertEqual(msg.get('to'), 'b??z')
- self.assertEqual(msg['to'], 'b??z')
+ self.assertEqual(str(msg.get('to')), 'b\uFFFD\uFFFDz')
+ self.assertEqual(str(msg['to']), 'b\uFFFD\uFFFDz')
def test_print_8bit_headers(self):
msg = email.message_from_bytes(self.headertest_msg)
self.assertEqual(str(msg),
- self.headertest_msg.decode(
- 'ascii', 'replace').replace('�', '?'))
+ textwrap.dedent("""\
+ From: {}
+ To: {}
+ Subject: {}
+ From: {}
+
+ Yes, they are flying.
+ """).format(*[expected[1] for (_, expected) in
+ self.headertest_headers]))
def test_values_with_8bit_headers(self):
msg = email.message_from_bytes(self.headertest_msg)
- self.assertListEqual(msg.values(),
+ self.assertListEqual([str(x) for x in msg.values()],
['foo@bar.com',
- 'b??z',
- 'Maintenant je vous pr??sente mon '
- 'coll??gue, le pouf c??l??bre\n'
+ 'b\uFFFD\uFFFDz',
+ 'Maintenant je vous pr\uFFFD\uFFFDsente mon '
+ 'coll\uFFFD\uFFFDgue, le pouf '
+ 'c\uFFFD\uFFFDl\uFFFD\uFFFDbre\n'
'\tJean de Baddie',
- "g??st"])
+ "g\uFFFD\uFFFDst"])
def test_items_with_8bit_headers(self):
msg = email.message_from_bytes(self.headertest_msg)
- self.assertListEqual(msg.items(),
+ self.assertListEqual([(str(x), str(y)) for (x, y) in msg.items()],
[('From', 'foo@bar.com'),
- ('To', 'b??z'),
- ('Subject', 'Maintenant je vous pr??sente mon '
- 'coll??gue, le pouf c??l??bre\n'
- '\tJean de Baddie'),
- ('From', 'g??st')])
+ ('To', 'b\uFFFD\uFFFDz'),
+ ('Subject', 'Maintenant je vous '
+ 'pr\uFFFD\uFFFDsente '
+ 'mon coll\uFFFD\uFFFDgue, le pouf '
+ 'c\uFFFD\uFFFDl\uFFFD\uFFFDbre\n'
+ '\tJean de Baddie'),
+ ('From', 'g\uFFFD\uFFFDst')])
def test_get_all_with_8bit_headers(self):
msg = email.message_from_bytes(self.headertest_msg)
- self.assertListEqual(msg.get_all('from'),
+ self.assertListEqual([str(x) for x in msg.get_all('from')],
['foo@bar.com',
- 'g??st'])
+ 'g\uFFFD\uFFFDst'])
non_latin_bin_msg = textwrap.dedent("""\
From: foo@bar.com
@@ -2964,13 +2977,12 @@ class Test8BitBytesHandling(unittest.TestCase):
email.generator.BytesGenerator(out).flatten(msg)
self.assertEqual(out.getvalue(), self.non_latin_bin_msg)
- # XXX: ultimately the '?' should turn into CTE encoded bytes
- # using 'unknown-8bit' charset.
- non_latin_bin_msg_as7bit = textwrap.dedent("""\
+ non_latin_bin_msg_as7bit_wrapped = textwrap.dedent("""\
From: foo@bar.com
- To: b??z
- Subject: Maintenant je vous pr??sente mon coll??gue, le pouf c??l??bre
- \tJean de Baddie
+ To: =?unknown-8bit?q?b=C3=A1z?=
+ Subject: =?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_coll=C3=A8gue?=
+ =?unknown-8bit?q?=2C_le_pouf_c=C3=A9l=C3=A8bre?=
+ =?unknown-8bit?q?_Jean_de_Baddie?=
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
@@ -2982,7 +2994,7 @@ class Test8BitBytesHandling(unittest.TestCase):
msg = email.message_from_bytes(self.non_latin_bin_msg)
out = StringIO()
email.generator.Generator(out).flatten(msg)
- self.assertEqual(out.getvalue(), self.non_latin_bin_msg_as7bit)
+ self.assertEqual(out.getvalue(), self.non_latin_bin_msg_as7bit_wrapped)
def test_bytes_generator_with_unix_from(self):
# The unixfrom contains a current date, so we can't check it
@@ -2995,6 +3007,12 @@ class Test8BitBytesHandling(unittest.TestCase):
self.assertEqual(lines[0].split()[0], b'From')
self.assertEqual(b'\n'.join(lines[1:]), self.non_latin_bin_msg)
+ non_latin_bin_msg_as7bit = non_latin_bin_msg_as7bit_wrapped.split('\n')
+ non_latin_bin_msg_as7bit[2:4] = [
+ 'Subject: =?unknown-8bit?q?Maintenant_je_vous_pr=C3=A9sente_mon_'
+ 'coll=C3=A8gue=2C_le_pouf_c=C3=A9l=C3=A8bre?=']
+ non_latin_bin_msg_as7bit = '\n'.join(non_latin_bin_msg_as7bit)
+
def test_message_from_binary_file(self):
fn = 'test.msg'
self.addCleanup(unlink, fn)
diff --git a/Misc/NEWS b/Misc/NEWS
index 5361574..8c7c7fd 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -40,6 +40,10 @@ Core and Builtins
Library
-------
+- Issue #10686: the email package now :rfc:`2047`\ -encodes headers with
+ non-ASCII bytes (parsed by a Bytes Parser) when doing conversion to
+ 7bit-clean presentation, instead of replacing them with ?s.
+
- email.header.Header was incorrectly encoding folding white space when
rfc2047-encoding header values with embedded newlines, leaving them
without folding whitespace. It now uses the continuation_ws, as it