From 9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 7 Jan 2011 23:25:30 +0000 Subject: #10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s. This applies only when generating strings from non-RFC compliant binary input; it makes the existing recoding behavior more consistent (ie: now no data is lost when recoding). --- Doc/library/email.generator.rst | 4 +- Doc/library/email.header.rst | 10 ++++- Doc/library/email.message.rst | 7 ++-- Doc/whatsnew/3.2.rst | 2 + Lib/email/charset.py | 22 ++++++++--- Lib/email/header.py | 9 ++++- Lib/email/message.py | 25 ++++++------ Lib/email/test/test_email.py | 88 +++++++++++++++++++++++++---------------- Misc/NEWS | 4 ++ 9 files changed, 109 insertions(+), 62 deletions(-) diff --git a/Doc/library/email.generator.rst b/Doc/library/email.generator.rst index 22d8b09..85b32fe 100644 --- a/Doc/library/email.generator.rst +++ b/Doc/library/email.generator.rst @@ -79,8 +79,8 @@ Here are the public methods of the :class:`Generator` class, imported from the Messages parsed with a Bytes parser that have a :mailheader:`Content-Transfer-Encoding` of 8bit will be converted to a - use a 7bit Content-Transfer-Encoding. Any other non-ASCII bytes in the - message structure will be converted to '?' characters. + use a 7bit Content-Transfer-Encoding. Non-ASCII bytes in the headers + will be :rfc:`2047` encoded with a charset of `unknown-8bit`. .. versionchanged:: 3.2 Added support for re-encoding 8bit message bodies, and the *linesep* diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst index ff2b484..29752c4 100644 --- a/Doc/library/email.header.rst +++ b/Doc/library/email.header.rst @@ -130,8 +130,14 @@ Here is the :class:`Header` class description: .. method:: __str__() - A helper for :class:`str`'s :func:`encode` method. Returns the header as - a Unicode string. + Returns an approximation of the :class:`Header` as a string, using an + unlimited line length. All pieces are converted to unicode using the + specified encoding and joined together appropriately. Any pieces with a + charset of `unknown-8bit` are decoded as `ASCII` using the `replace` + error handler. + + .. versionchanged:: 3.2 + Added handling for the `unknown-8bit` charset. .. method:: __eq__(other) diff --git a/Doc/library/email.message.rst b/Doc/library/email.message.rst index e76e689..29f7ba3 100644 --- a/Doc/library/email.message.rst +++ b/Doc/library/email.message.rst @@ -169,9 +169,10 @@ Here are the methods of the :class:`Message` class: Note that in all cases, any envelope header present in the message is not included in the mapping interface. - In a model generated from bytes, any header values that (in contravention - of the RFCs) contain non-ASCII bytes will have those bytes transformed - into '?' characters when the values are retrieved through this interface. + In a model generated from bytes, any header values that (in contravention of + the RFCs) contain non-ASCII bytes will, when retrieved through this + interface, be represented as :class:`~email.header.Header` objects with + a charset of `unknown-8bit`. .. method:: __len__() diff --git a/Doc/whatsnew/3.2.rst b/Doc/whatsnew/3.2.rst index b6e2550..69b318e 100644 --- a/Doc/whatsnew/3.2.rst +++ b/Doc/whatsnew/3.2.rst @@ -618,6 +618,8 @@ format. * Given bytes input to the model, :class:`~email.generator.Generator` will convert message bodies that have a :mailheader:`Content-Transfer-Encoding` of *8bit* to instead have a *7bit* :mailheader:`Content-Transfer-Encoding`. + XXX: Headers with Un-encoded non-ASCII bytes will be :rfc:`2047`\ -encoded + using the charset `unknown-8bit`. * A new class :class:`~email.generator.BytesGenerator` produces bytes as output, preserving any unchanged non-ASCII data that was present in the input used to diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 898beed..8591527 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers RFC2047_CHROME_LEN = 7 DEFAULT_CHARSET = 'us-ascii' +UNKNOWN8BIT = 'unknown-8bit' EMPTYSTRING = '' @@ -153,6 +154,16 @@ def add_codec(charset, codecname): +# Convenience function for encoding strings, taking into account +# that they might be unknown-8bit (ie: have surrogate-escaped bytes) +def _encode(string, codec): + if codec == UNKNOWN8BIT: + return string.encode('ascii', 'surrogateescape') + else: + return string.encode(codec) + + + class Charset: """Map character sets to their email properties. @@ -282,8 +293,7 @@ class Charset: :return: The encoded string, with RFC 2047 chrome. """ codec = self.output_codec or 'us-ascii' - charset = self.get_output_charset() - header_bytes = string.encode(codec) + header_bytes = _encode(string, codec) # 7bit/8bit encodings return the string unchanged (modulo conversions) encoder_module = self._get_encoder(header_bytes) if encoder_module is None: @@ -309,7 +319,7 @@ class Charset: """ # See which encoding we should use. codec = self.output_codec or 'us-ascii' - header_bytes = string.encode(codec) + header_bytes = _encode(string, codec) encoder_module = self._get_encoder(header_bytes) encoder = partial(encoder_module.header_encode, charset=str(self)) # Calculate the number of characters that the RFC 2047 chrome will @@ -333,7 +343,7 @@ class Charset: for character in string: current_line.append(character) this_line = EMPTYSTRING.join(current_line) - length = encoder_module.header_length(this_line.encode(charset)) + length = encoder_module.header_length(_encode(this_line, charset)) if length > maxlen: # This last character doesn't fit so pop it off. current_line.pop() @@ -343,12 +353,12 @@ class Charset: else: separator = (' ' if lines else '') joined_line = EMPTYSTRING.join(current_line) - header_bytes = joined_line.encode(codec) + header_bytes = _encode(joined_line, codec) lines.append(encoder(header_bytes)) current_line = [character] maxlen = next(maxlengths) - extra joined_line = EMPTYSTRING.join(current_line) - header_bytes = joined_line.encode(codec) + header_bytes = _encode(joined_line, codec) lines.append(encoder(header_bytes)) return lines diff --git a/Lib/email/header.py b/Lib/email/header.py index d462bf0..f90883f 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -17,7 +17,8 @@ import email.quoprimime import email.base64mime from email.errors import HeaderParseError -from email.charset import Charset +from email import charset as _charset +Charset = _charset.Charset NL = '\n' SPACE = ' ' @@ -210,6 +211,9 @@ class Header: # from a charset to None/us-ascii, or from None/us-ascii to a # charset. Only do this for the second and subsequent chunks. nextcs = charset + if nextcs == _charset.UNKNOWN8BIT: + original_bytes = string.encode('ascii', 'surrogateescape') + string = original_bytes.decode('ascii', 'replace') if uchunks: if lastcs not in (None, 'us-ascii'): if nextcs in (None, 'us-ascii'): @@ -263,7 +267,8 @@ class Header: # Ensure that the bytes we're storing can be decoded to the output # character set, otherwise an early error is thrown. output_charset = charset.output_codec or 'us-ascii' - s.encode(output_charset, errors) + if output_charset != _charset.UNKNOWN8BIT: + s.encode(output_charset, errors) self._chunks.append((s, charset)) def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'): diff --git a/Lib/email/message.py b/Lib/email/message.py index d2483ca..b821bfd 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -16,7 +16,9 @@ from io import BytesIO, StringIO # Intrapackage imports from email import utils from email import errors -from email.charset import Charset +from email import header +from email import charset as _charset +Charset = _charset.Charset SEMISPACE = '; ' @@ -31,16 +33,15 @@ _has_surrogates = re.compile( # Helper functions -def _sanitize_surrogates(value): - # If the value contains surrogates, re-decode and replace the original - # non-ascii bytes with '?'s. Used to sanitize header values before letting - # them escape as strings. +def _sanitize_header(name, value): + # If the header value contains surrogates, return a Header using + # the unknown-8bit charset to encode the bytes as encoded words. if not isinstance(value, str): - # Header object + # Assume it is already a header object return value if _has_surrogates(value): - original_bytes = value.encode('ascii', 'surrogateescape') - return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?') + return header.Header(value, charset=_charset.UNKNOWN8BIT, + header_name=name) else: return value @@ -398,7 +399,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [_sanitize_surrogates(v) for k, v in self._headers] + return [_sanitize_header(k, v) for k, v in self._headers] def items(self): """Get all the message's header fields and values. @@ -408,7 +409,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [(k, _sanitize_surrogates(v)) for k, v in self._headers] + return [(k, _sanitize_header(k, v)) for k, v in self._headers] def get(self, name, failobj=None): """Get a header value. @@ -419,7 +420,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - return _sanitize_surrogates(v) + return _sanitize_header(k, v) return failobj # @@ -439,7 +440,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - values.append(_sanitize_surrogates(v)) + values.append(_sanitize_header(k, v)) if not values: return failobj return values diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index 4855371..a1798ce 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -2841,7 +2841,7 @@ class Test8BitBytesHandling(unittest.TestCase): cte='8bit', bodyline='pöstal').encode('utf-8') msg = email.message_from_bytes(m) - self.assertEqual(msg.get_payload(), "p��stal\n") + self.assertEqual(msg.get_payload(), "p\uFFFD\uFFFDstal\n") self.assertEqual(msg.get_payload(decode=True), "pöstal\n".encode('utf-8')) @@ -2874,7 +2874,7 @@ class Test8BitBytesHandling(unittest.TestCase): cte='quoted-printable', bodyline='p=C3=B6stál').encode('utf-8') msg = email.message_from_bytes(m) - self.assertEqual(msg.get_payload(), 'p=C3=B6st��l\n') + self.assertEqual(msg.get_payload(), 'p=C3=B6st\uFFFD\uFFFDl\n') self.assertEqual(msg.get_payload(decode=True), 'pöstál\n'.encode('utf-8')) @@ -2899,52 +2899,65 @@ class Test8BitBytesHandling(unittest.TestCase): '<,.V