From d92ae78bdbab63a68e88fb561a2bc9555d8cef6c Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Wed, 26 Jul 2006 05:54:46 +0000 Subject: Forward port some fixes that were in email 2.5 but for some reason didn't make it into email 4.0. Specifically, in Message.get_content_charset(), handle RFC 2231 headers that contain an encoding not known to Python, or a character in the data that isn't in the charset encoding. Also forward port the appropriate unit tests. --- Lib/email/message.py | 13 ++++++++++- Lib/email/test/test_email.py | 44 ++++++++++++++++++++++++++++++++++++ Lib/email/test/test_email_renamed.py | 44 ++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/Lib/email/message.py b/Lib/email/message.py index 50d90b4..79c5c4c 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -747,7 +747,18 @@ class Message: if isinstance(charset, tuple): # RFC 2231 encoded, so decode it, and it better end up as ascii. pcharset = charset[0] or 'us-ascii' - charset = unicode(charset[2], pcharset).encode('us-ascii') + try: + # LookupError will be raised if the charset isn't known to + # Python. UnicodeError will be raised if the encoded text + # contains a character not in the charset. + charset = unicode(charset[2], pcharset).encode('us-ascii') + except (LookupError, UnicodeError): + charset = charset[2] + # charset character must be in us-ascii range + try: + charset = unicode(charset, 'us-ascii').encode('us-ascii') + except UnicodeError: + return failobj # RFC 2046, $4.1.2 says charsets are not case sensitive return charset.lower() diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index db0c2be..13801dc 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -3086,6 +3086,50 @@ Content-Type: text/plain; self.assertEqual(msg.get_content_charset(), 'this is even more ***fun*** is it not.pdf') + def test_rfc2231_bad_encoding_in_filename(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="bogus'xx'This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2="is it not.pdf" + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), + 'This is even more ***fun*** is it not.pdf') + + def test_rfc2231_bad_encoding_in_charset(self): + m = """\ +Content-Type: text/plain; charset*=bogus''utf-8%E2%80%9D + +""" + msg = email.message_from_string(m) + # This should return None because non-ascii characters in the charset + # are not allowed. + self.assertEqual(msg.get_content_charset(), None) + + def test_rfc2231_bad_character_in_charset(self): + m = """\ +Content-Type: text/plain; charset*=ascii''utf-8%E2%80%9D + +""" + msg = email.message_from_string(m) + # This should return None because non-ascii characters in the charset + # are not allowed. + self.assertEqual(msg.get_content_charset(), None) + + def test_rfc2231_bad_character_in_filename(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="ascii'xx'This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2*="is it not.pdf%E2" + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), + u'This is even more ***fun*** is it not.pdf\ufffd') + def test_rfc2231_unknown_encoding(self): m = """\ Content-Transfer-Encoding: 8bit diff --git a/Lib/email/test/test_email_renamed.py b/Lib/email/test/test_email_renamed.py index 680a725..30f39b9 100644 --- a/Lib/email/test/test_email_renamed.py +++ b/Lib/email/test/test_email_renamed.py @@ -3092,6 +3092,50 @@ Content-Type: text/plain; self.assertEqual(msg.get_content_charset(), 'this is even more ***fun*** is it not.pdf') + def test_rfc2231_bad_encoding_in_filename(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="bogus'xx'This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2="is it not.pdf" + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), + 'This is even more ***fun*** is it not.pdf') + + def test_rfc2231_bad_encoding_in_charset(self): + m = """\ +Content-Type: text/plain; charset*=bogus''utf-8%E2%80%9D + +""" + msg = email.message_from_string(m) + # This should return None because non-ascii characters in the charset + # are not allowed. + self.assertEqual(msg.get_content_charset(), None) + + def test_rfc2231_bad_character_in_charset(self): + m = """\ +Content-Type: text/plain; charset*=ascii''utf-8%E2%80%9D + +""" + msg = email.message_from_string(m) + # This should return None because non-ascii characters in the charset + # are not allowed. + self.assertEqual(msg.get_content_charset(), None) + + def test_rfc2231_bad_character_in_filename(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="ascii'xx'This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2*="is it not.pdf%E2" + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), + u'This is even more ***fun*** is it not.pdf\ufffd') + def test_rfc2231_unknown_encoding(self): m = """\ Content-Transfer-Encoding: 8bit -- cgit v0.12