summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2015-10-02 10:14:03 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com>, 2015-10-02 10:14:03 (GMT)
commit: 29e68edbf47b708c1f7082c2235d82f41e747635 (patch)
tree: 554941fc129b51181ea2aa7799a967a3042ebcec
parent: 5dbe245ef238762c8e1100885e8671bf2e089157 (diff)
parent: 58c8f2bb6de115b620cec3cf995f04005573765c (diff)
download: cpython-29e68edbf47b708c1f7082c2235d82f41e747635.zip
download: cpython-29e68edbf47b708c1f7082c2235d82f41e747635.tar.gz
download: cpython-29e68edbf47b708c1f7082c2235d82f41e747635.tar.bz2
Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:
1. Non-ASCII bytes were accepted after a shift sequence.
2. A low surrogate could be emitted when there was an error in the high surrogate.
3. In some circumstances the '\xfd' character was produced instead of the replacement character '\ufffd' (due to a bug in _PyUnicodeWriter).
-rw-r--r-- Lib/test/test_codecs.py 60
-rw-r--r-- Lib/test/test_unicode.py 3
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Objects/unicodeobject.c 21
4 files changed, 75 insertions, 11 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 4658497..bdc331e 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -926,6 +926,32 @@ class CP65001Test(ReadTest, unittest.TestCase):
class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"
+ def test_ascii(self):
+ # Set D (directly encoded characters)
+ set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ 'abcdefghijklmnopqrstuvwxyz'
+ '0123456789'
+ '\'(),-./:?')
+ self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
+ self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
+ # Set O (optional direct characters)
+ set_o = ' !"#$%&*;<=>@[]^_`{|}'
+ self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
+ self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
+ # +
+ self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
+ self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
+ # White spaces
+ ws = ' \t\n\r'
+ self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
+ self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
+ # Other ASCII characters
+ other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
+ set(set_d + set_o + '+' + ws)))
+ self.assertEqual(other_ascii.encode(self.encoding),
+ b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
+ b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
+
def test_partial(self):
self.check_partial(
'a+-b\x00c\x80d\u0100e\U00010000f',
@@ -967,7 +993,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
def test_errors(self):
tests = [
+ (b'\xffb', '\ufffdb'),
(b'a\xffb', 'a\ufffdb'),
+ (b'a\xff\xffb', 'a\ufffd\ufffdb'),
(b'a+IK', 'a\ufffd'),
(b'a+IK-b', 'a\ufffdb'),
(b'a+IK,b', 'a\ufffdb'),
@@ -983,6 +1011,8 @@ class UTF7Test(ReadTest, unittest.TestCase):
(b'a+//,+IKw-b', 'a\ufffd\u20acb'),
(b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
+ (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
+ (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
]
for raw, expected in tests:
with self.subTest(raw=raw):
@@ -994,8 +1024,36 @@ class UTF7Test(ReadTest, unittest.TestCase):
self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
+ self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
+ self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
+ self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
+ self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
+ self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
+ b'+IKwgrNgB3KA-')
+ self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
+ '\u20ac\u20ac\U000104A0')
+ self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
+ '\u20ac\u20ac\U000104A0')
- test_lone_surrogates = None
+ def test_lone_surrogates(self):
+ tests = [
+ (b'a+2AE-b', 'a\ud801b'),
+ (b'a+2AE\xffb', 'a\ufffdb'),
+ (b'a+2AE', 'a\ufffd'),
+ (b'a+2AEA-b', 'a\ufffdb'),
+ (b'a+2AH-b', 'a\ufffdb'),
+ (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
+ (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
+ (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
+ (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
+ (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
+ (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
+ (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
+ (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
+ ]
+ for raw, expected in tests:
+ with self.subTest(raw=raw):
+ self.assertEqual(raw.decode('utf-7', 'replace'), expected)
class UTF16ExTest(unittest.TestCase):
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 3fcb590..1429a6d 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1553,7 +1553,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
# Issue #2242: crash on some Windows/MSVC versions
- self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
+ self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
# Direct encoded characters
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
@@ -1995,6 +1995,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
+ self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
# Error handling (unknown character names)
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
diff --git a/Misc/NEWS b/Misc/NEWS
index a9b124c..b43073f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ Release date: XXXX-XX-XX
Core and Builtins
-----------------
+- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
+
- Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
handlers: ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
Patch co-written with Serhiy Storchaka.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 93c4ad9..4fd0430 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4360,31 +4360,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
}
else { /* now leaving a base-64 section */
inShift = 0;
- s++;
- if (surrogate) {
- if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
- goto onError;
- surrogate = 0;
- }
if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) {
/* We've seen at least one base-64 character */
+ s++;
errmsg = "partial character in shift sequence";
goto utf7Error;
}
else {
/* Some bits remain; they should be zero */
if (base64buffer != 0) {
+ s++;
errmsg = "non-zero padding bits in shift sequence";
goto utf7Error;
}
}
}
- if (ch != '-') {
+ if (surrogate && DECODE_DIRECT(ch)) {
+ if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
+ goto onError;
+ }
+ surrogate = 0;
+ if (ch == '-') {
/* '-' is absorbed; other terminating
characters are preserved */
- if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
- goto onError;
+ s++;
}
}
}
@@ -4398,6 +4398,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
}
else { /* begin base64-encoded section */
inShift = 1;
+ surrogate = 0;
shiftOutStart = writer.pos;
base64bits = 0;
base64buffer = 0;
@@ -4429,6 +4430,7 @@ utf7Error:
if (inShift && !consumed) { /* in shift sequence, no more to follow */
/* if we're in an inconsistent state, that's an error */
+ inShift = 0;
if (surrogate ||
(base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) {
@@ -13366,6 +13368,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
if (maxchar > writer->maxchar || writer->readonly) {
/* resize + widen */
+ maxchar = Py_MAX(maxchar, writer->maxchar);
newbuffer = PyUnicode_New(newlen, maxchar);
if (newbuffer == NULL)
return -1;