diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2011-11-15 00:42:21 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2011-11-15 00:42:21 (GMT) |
commit | 5418ee0b9a36886064937159f9c0641ae2c4f618 (patch) | |
tree | 5602bfce8d5bea5b17deee14207d2a238170489a | |
parent | c2fe57762b6cfa8849908e1a0475036cd0b058ba (diff) | |
download | cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.zip cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.tar.gz cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.tar.bz2 |
Issue #13333: The UTF-7 decoder now accepts lone surrogates
(the encoder already accepts them).
-rw-r--r-- | Lib/test/test_unicode.py | 16 |
-rw-r--r-- | Misc/NEWS | 3 |
-rw-r--r-- | Objects/unicodeobject.c | 14 |
3 files changed, 20 insertions, 13 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 86185e9..591a297 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1091,10 +1091,18 @@ class UnicodeTest(string_tests.CommonTest,
         for (x, y) in utfTests:
             self.assertEqual(x.encode('utf-7'), y)
 
-        # Unpaired surrogates not supported
-        self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
-
-        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
+        # Unpaired surrogates are passed through
+        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
+        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
+        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
+        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
+        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
+        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
+        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
+        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
+
+        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
+        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
 
         # Issue #2242: crash on some Windows/MSVC versions
         self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.2.3?
 Core and Builtins
 -----------------
 
+- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder
+  already accepts them).
+
 - Issue #13342: input() used to ignore sys.stdin's and sys.stdout's unicode
   error handler in interactive mode (when calling into PyOS_Readline()).
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7316abf..8680726 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2282,21 +2282,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                             *p++ = outCh;
 #endif
                             surrogate = 0;
+                            continue;
                         }
                         else {
+                            *p++ = surrogate;
                             surrogate = 0;
-                            errmsg = "second surrogate missing";
-                            goto utf7Error;
                         }
                     }
-                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                         /* first surrogate */
                         surrogate = outCh;
                     }
-                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
-                        errmsg = "unexpected second surrogate";
-                        goto utf7Error;
-                    }
                     else {
                         *p++ = outCh;
                     }
@@ -2306,8 +2302,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                 inShift = 0;
                 s++;
                 if (surrogate) {
-                    errmsg = "second surrogate missing at end of shift sequence";
-                    goto utf7Error;
+                    *p++ = surrogate;
+                    surrogate = 0;
                 }
                 if (base64bits > 0) { /* left-over bits */
                     if (base64bits >= 6) {