From 28b21e50c8f1bc9f4524b02df75b83f3b5efacb4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 2 Oct 2015 13:07:28 +0300 Subject: Issue #24848: Fixed bugs in UTF-7 decoding of misformed data: 1. Non-ASCII bytes were accepted after shift sequence. 2. A low surrogate could be emitted in case of error in high surrogate. --- Lib/test/test_codecs.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++- Lib/test/test_unicode.py | 3 ++- Misc/NEWS | 2 ++ Objects/unicodeobject.c | 21 +++++++++-------- 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 8b78c24..a1079a1 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -898,6 +898,32 @@ class CP65001Test(ReadTest, unittest.TestCase): class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" + def test_ascii(self): + # Set D (directly encoded characters) + set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' + '\'(),-./:?') + self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) + self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) + # Set O (optional direct characters) + set_o = ' !"#$%&*;<=>@[]^_`{|}' + self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) + self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) + # + + self.assertEqual('a+b'.encode(self.encoding), b'a+-b') + self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') + # White spaces + ws = ' \t\n\r' + self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) + self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) + # Other ASCII characters + other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - + set(set_d + set_o + '+' + ws))) + self.assertEqual(other_ascii.encode(self.encoding), + b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' + b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') + def test_partial(self): self.check_partial( 'a+-b\x00c\x80d\u0100e\U00010000f', @@ -939,7 +965,9 @@ class UTF7Test(ReadTest, unittest.TestCase): def test_errors(self): tests = [ + (b'\xffb', '\ufffdb'), (b'a\xffb', 'a\ufffdb'), + (b'a\xff\xffb', 'a\ufffd\ufffdb'), (b'a+IK', 'a\ufffd'), (b'a+IK-b', 'a\ufffdb'), (b'a+IK,b', 'a\ufffdb'), @@ -955,6 +983,8 @@ class UTF7Test(ReadTest, unittest.TestCase): (b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), + (b'a+IKw-b\xff', 'a\u20acb\ufffd'), + (b'a+IKw\xffb', 'a\u20ac\ufffdb'), ] for raw, expected in tests: with self.subTest(raw=raw): @@ -966,8 +996,36 @@ class UTF7Test(ReadTest, unittest.TestCase): self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') + self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') + self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') + self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), + b'+IKwgrNgB3KA-') + self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') + self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') - test_lone_surrogates = None + def test_lone_surrogates(self): + tests = [ + (b'a+2AE-b', 'a\ud801b'), + (b'a+2AE\xffb', 'a\ufffdb'), + (b'a+2AE', 'a\ufffd'), + (b'a+2AEA-b', 'a\ufffdb'), + (b'a+2AH-b', 'a\ufffdb'), + (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), + (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), + (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), + (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), + (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'), + (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), + (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), + (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), + ] + for raw, expected in tests: + with self.subTest(raw=raw): + self.assertEqual(raw.decode('utf-7', 'replace'), expected) class UTF16ExTest(unittest.TestCase): diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 5efbe3e..2cc1d7c 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1524,7 +1524,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') # Issue #2242: crash on some Windows/MSVC versions - self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') + self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '') # Direct encoded characters set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" @@ -1966,6 +1966,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') + self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x') # Error handling (unknown character names) self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") diff --git a/Misc/NEWS b/Misc/NEWS index 14fa1c2..99185d1 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ Release date: tba Core and Builtins ----------------- +- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. + - Issue #25280: Import trace messages emitted in verbose (-v) mode are no longer formatted twice. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e28bae4..e9281ad 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s, } else { /* now leaving a base-64 section */ inShift = 0; - s++; - if (surrogate) { - if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) - goto onError; - surrogate = 0; - } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) { /* We've seen at least one base-64 character */ + s++; errmsg = "partial character in shift sequence"; goto utf7Error; } else { /* Some bits remain; they should be zero */ if (base64buffer != 0) { + s++; errmsg = "non-zero padding bits in shift sequence"; goto utf7Error; } } } - if (ch != '-') { + if (surrogate && DECODE_DIRECT(ch)) { + if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) + goto onError; + } + surrogate = 0; + if (ch == '-') { /* '-' is absorbed; other terminating characters are preserved */ - if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) - goto onError; + s++; } } } @@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s, } else { /* begin base64-encoded section */ inShift = 1; + surrogate = 0; shiftOutStart = writer.pos; base64bits = 0; base64buffer = 0; @@ -4450,6 +4451,7 @@ utf7Error: if (inShift && !consumed) { /* in shift sequence, no more to follow */ /* if we're in an inconsistent state, that's an error */ + inShift = 0; if (surrogate || (base64bits >= 6) || (base64bits > 0 && base64buffer != 0)) { @@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ + maxchar = Py_MAX(maxchar, writer->maxchar); newbuffer = PyUnicode_New(newlen, maxchar); if (newbuffer == NULL) return -1; -- cgit v0.12