summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
author: Barry Warsaw <barry@python.org> 2002-10-14 16:52:41 (GMT)
committer: Barry Warsaw <barry@python.org> 2002-10-14 16:52:41 (GMT)
commit: 67f8f2fe2a93081aaac8ddc1409df4b05daf4fab (patch)
tree: a3adab8513c8ffa903b2ec3a6bf735f99df09f24 /Lib
parent: 816aebdf94c7837e3d2a2d297886b5fbcbdc9406 (diff)
downloadcpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.zip
cpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.tar.gz
cpython-67f8f2fe2a93081aaac8ddc1409df4b05daf4fab.tar.bz2
append(): Fixing the test for convertibility after consultation with
Ben. If s is a byte string, make sure it can be converted to unicode with the input codec, and from unicode with the output codec, or raise a UnicodeError exception early. Skip this test (and the unicode->byte string conversion) when the charset is our faux 8bit raw charset.
Diffstat (limited to 'Lib')
-rw-r--r--Lib/email/Header.py42
1 files changed, 28 insertions, 14 deletions
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index 9bbc32f..0ceacc7 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -218,20 +218,34 @@ class Header:
charset = self._charset
elif not isinstance(charset, Charset):
charset = Charset(charset)
- # Normalize and check the string
- if isinstance(s, StringType):
- # Possibly raise UnicodeError if it can't be encoded
- unicode(s, charset.get_output_charset())
- elif isinstance(s, UnicodeType):
- # Convert Unicode to byte string for later concatenation
- for charset in USASCII, charset, UTF8:
- try:
- s = s.encode(charset.get_output_charset())
- break
- except UnicodeError:
- pass
- else:
- assert False, 'Could not encode to utf-8'
+ # If the charset is our faux 8bit charset, leave the string unchanged
+ if charset <> '8bit':
+ # We need to test that the string can be converted to unicode and
+ # back to a byte string, given the input and output codecs of the
+ # charset.
+ if isinstance(s, StringType):
+ # Possibly raise UnicodeError if the byte string can't be
+ # converted to a unicode with the input codec of the charset.
+ incodec = charset.input_codec or 'us-ascii'
+ ustr = unicode(s, incodec)
+ # Now make sure that the unicode could be converted back to a
+ # byte string with the output codec, which may be different
+ # than the input codec. Still, use the original byte string.
+ outcodec = charset.output_codec or 'us-ascii'
+ ustr.encode(outcodec)
+ elif isinstance(s, UnicodeType):
+ # Now we have to be sure the unicode string can be converted
+ # to a byte string with a reasonable output codec. We want to
+ # use the byte string in the chunk.
+ for charset in USASCII, charset, UTF8:
+ try:
+ outcodec = charset.output_codec or 'us-ascii'
+ s = s.encode(outcodec)
+ break
+ except UnicodeError:
+ pass
+ else:
+ assert False, 'utf-8 conversion failed'
self._chunks.append((s, charset))
def _split(self, s, charset, firstline=False):