diff options
author | R. David Murray <rdmurray@bitdance.com> | 2010-10-08 15:55:28 (GMT) |
---|---|---|
committer | R. David Murray <rdmurray@bitdance.com> | 2010-10-08 15:55:28 (GMT) |
commit | 96fd54eaec700cc50e5960f45ee79bc25c2c48c5 (patch) | |
tree | 4e4fc3f48d8957b6b0fccc372410e8374ce4fb70 /Lib/email/message.py | |
parent | 59fdd6736bbf1ba14083a4bb777abaefc364f876 (diff) | |
download | cpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.zip cpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.tar.gz cpython-96fd54eaec700cc50e5960f45ee79bc25c2c48c5.tar.bz2 |
#4661: add bytes parsing and generation to email (email version bump to 5.1.0)
The work on this is not 100% complete, but everything needed for
real-world testing of the code is present. The only remaining major to-do
item is to (hopefully!) improve the handling of non-ASCII bytes in headers
that are converted to unicode, by RFC 2047-encoding them rather than
replacing them with '?'s.
Diffstat (limited to 'Lib/email/message.py')
-rw-r--r-- | Lib/email/message.py | 98 |
1 files changed, 72 insertions, 26 deletions
diff --git a/Lib/email/message.py b/Lib/email/message.py index 923b26c..a835ce2 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -24,8 +24,26 @@ SEMISPACE = '; ' # existence of which force quoting of the parameter value. tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') +# How to figure out if we are processing strings that come from a byte +# source with undecodable characters. +_has_surrogates = re.compile( + '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search + # Helper functions +def _sanitize_surrogates(value): + # If the value contains surrogates, re-decode and replace the original + # non-ascii bytes with '?'s. Used to sanitize header values before letting + # them escape as strings. + if not isinstance(value, str): + # Header object + return value + if _has_surrogates(value): + original_bytes = value.encode('ascii', 'surrogateescape') + return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?') + else: + return value + def _splitparam(param): # Split header parameters. BAW: this may be too simple. It isn't # strictly RFC 2045 (section 5.1) compliant, but it catches most headers @@ -184,44 +202,72 @@ class Message: If the message is a multipart and the decode flag is True, then None is returned. """ - if i is None: - payload = self._payload - elif not isinstance(self._payload, list): + # Here is the logic table for this code, based on the email5.0.0 code: + # i decode is_multipart result + # ------ ------ ------------ ------------------------------ + # None True True None + # i True True None + # None False True _payload (a list) + # i False True _payload element i (a Message) + # i False False error (not a list) + # i True False error (not a list) + # None False False _payload + # None True False _payload decoded (bytes) + # Note that Barry planned to factor out the 'decode' case, but that + # isn't so easy now that we handle the 8 bit data, which needs to be + # converted in both the decode and non-decode path. 
+ if self.is_multipart(): + if decode: + return None + if i is None: + return self._payload + else: + return self._payload[i] + # For backward compatibility, Use isinstance and this error message + # instead of the more logical is_multipart test. + if i is not None and not isinstance(self._payload, list): raise TypeError('Expected list, got %s' % type(self._payload)) - else: - payload = self._payload[i] + payload = self._payload + cte = self.get('content-transfer-encoding', '').lower() + # payload can be bytes here, (I wonder if that is actually a bug?) + if isinstance(payload, str): + if _has_surrogates(payload): + bpayload = payload.encode('ascii', 'surrogateescape') + if not decode: + try: + payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace') + except LookupError: + payload = bpayload.decode('ascii', 'replace') + elif decode: + try: + bpayload = payload.encode('ascii') + except UnicodeError: + # This won't happen for RFC compliant messages (messages + # containing only ASCII codepoints in the unicode input). + # If it does happen, turn the string into bytes in a way + # guaranteed not to fail. + bpayload = payload.encode('raw-unicode-escape') if not decode: return payload - # Decoded payloads always return bytes. XXX split this part out into - # a new method called .get_decoded_payload(). 
- if self.is_multipart(): - return None - cte = self.get('content-transfer-encoding', '').lower() if cte == 'quoted-printable': - if isinstance(payload, str): - payload = payload.encode('ascii') - return utils._qdecode(payload) + return utils._qdecode(bpayload) elif cte == 'base64': try: - if isinstance(payload, str): - payload = payload.encode('ascii') - return base64.b64decode(payload) + return base64.b64decode(bpayload) except binascii.Error: # Incorrect padding - pass + return bpayload elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): - in_file = BytesIO(payload.encode('ascii')) + in_file = BytesIO(bpayload) out_file = BytesIO() try: uu.decode(in_file, out_file, quiet=True) return out_file.getvalue() except uu.Error: # Some decoding problem - pass - # Is there a better way to do this? We can't use the bytes - # constructor. + return bpayload if isinstance(payload, str): - return payload.encode('raw-unicode-escape') + return bpayload return payload def set_payload(self, payload, charset=None): @@ -340,7 +386,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [v for k, v in self._headers] + return [_sanitize_surrogates(v) for k, v in self._headers] def items(self): """Get all the message's header fields and values. @@ -350,7 +396,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return self._headers[:] + return [(k, _sanitize_surrogates(v)) for k, v in self._headers] def get(self, name, failobj=None): """Get a header value. @@ -361,7 +407,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - return v + return _sanitize_surrogates(v) return failobj # @@ -381,7 +427,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - values.append(v) + values.append(_sanitize_surrogates(v)) if not values: return failobj return values |