diff options
author | Barry Warsaw <barry@python.org> | 2002-10-14 16:52:41 (GMT) |
---|---|---|
committer | Barry Warsaw <barry@python.org> | 2002-10-14 16:52:41 (GMT) |
commit | 67f8f2fe2a93081aaac8ddc1409df4b05daf4fab (patch) | |
tree | a3adab8513c8ffa903b2ec3a6bf735f99df09f24 /Lib | |
parent | 816aebdf94c7837e3d2a2d297886b5fbcbdc9406 (diff) | |
download | cpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.zip cpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.tar.gz cpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.tar.bz2 |
append(): Fixing the test for convertibility after consultation with
Ben. If s is a byte string, make sure it can be converted to unicode
with the input codec, and from unicode with the output codec, or raise
a UnicodeError exception early. Skip this test (and the unicode->byte
string conversion) when the charset is our faux 8bit raw charset.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/email/Header.py | 42 |
1 files changed, 28 insertions, 14 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index 9bbc32f..0ceacc7 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -218,20 +218,34 @@ class Header:
             charset = self._charset
         elif not isinstance(charset, Charset):
             charset = Charset(charset)
-        # Normalize and check the string
-        if isinstance(s, StringType):
-            # Possibly raise UnicodeError if it can't be encoded
-            unicode(s, charset.get_output_charset())
-        elif isinstance(s, UnicodeType):
-            # Convert Unicode to byte string for later concatenation
-            for charset in USASCII, charset, UTF8:
-                try:
-                    s = s.encode(charset.get_output_charset())
-                    break
-                except UnicodeError:
-                    pass
-            else:
-                assert False, 'Could not encode to utf-8'
+        # If the charset is our faux 8bit charset, leave the string unchanged
+        if charset <> '8bit':
+            # We need to test that the string can be converted to unicode and
+            # back to a byte string, given the input and output codecs of the
+            # charset.
+            if isinstance(s, StringType):
+                # Possibly raise UnicodeError if the byte string can't be
+                # converted to a unicode with the input codec of the charset.
+                incodec = charset.input_codec or 'us-ascii'
+                ustr = unicode(s, incodec)
+                # Now make sure that the unicode could be converted back to a
+                # byte string with the output codec, which may be different
+                # than the iput coded. Still, use the original byte string.
+                outcodec = charset.output_codec or 'us-ascii'
+                ustr.encode(outcodec)
+            elif isinstance(s, UnicodeType):
+                # Now we have to be sure the unicode string can be converted
+                # to a byte string with a reasonable output codec. We want to
+                # use the byte string in the chunk.
+                for charset in USASCII, charset, UTF8:
+                    try:
+                        outcodec = charset.output_codec or 'us-ascii'
+                        s = s.encode(outcodec)
+                        break
+                    except UnicodeError:
+                        pass
+                else:
+                    assert False, 'utf-8 conversion failed'
         self._chunks.append((s, charset))
 
     def _split(self, s, charset, firstline=False):