summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Antoine Pitrou <solipsis@pitrou.net> 2011-11-15 00:42:21 (GMT)
committer Antoine Pitrou <solipsis@pitrou.net> 2011-11-15 00:42:21 (GMT)
commit 5418ee0b9a36886064937159f9c0641ae2c4f618 (patch)
tree 5602bfce8d5bea5b17deee14207d2a238170489a
parent c2fe57762b6cfa8849908e1a0475036cd0b058ba (diff)
download cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.zip
cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.tar.gz
cpython-5418ee0b9a36886064937159f9c0641ae2c4f618.tar.bz2
Issue #13333: The UTF-7 decoder now accepts lone surrogates
(the encoder already accepts them).
-rw-r--r-- Lib/test/test_unicode.py 16
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Objects/unicodeobject.c 14
3 files changed, 20 insertions, 13 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 86185e9..591a297 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1091,10 +1091,18 @@ class UnicodeTest(string_tests.CommonTest,
for (x, y) in utfTests:
self.assertEqual(x.encode('utf-7'), y)
- # Unpaired surrogates not supported
- self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
-
- self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
+ # Unpaired surrogates are passed through
+ self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
+ self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
+ self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
+ self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
+ self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
+ self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
+ self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
+ self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
+
+ self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
+ self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
# Issue #2242: crash on some Windows/MSVC versions
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
diff --git a/Misc/NEWS b/Misc/NEWS
index ca8d4cb..4fb9ff6 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.2.3?
Core and Builtins
-----------------
+- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder
+ already accepts them).
+
- Issue #13342: input() used to ignore sys.stdin's and sys.stdout's unicode
error handler in interactive mode (when calling into PyOS_Readline()).
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7316abf..8680726 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2282,21 +2282,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
*p++ = outCh;
#endif
surrogate = 0;
+ continue;
}
else {
+ *p++ = surrogate;
surrogate = 0;
- errmsg = "second surrogate missing";
- goto utf7Error;
}
}
- else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+ if (outCh >= 0xD800 && outCh <= 0xDBFF) {
/* first surrogate */
surrogate = outCh;
}
- else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
- errmsg = "unexpected second surrogate";
- goto utf7Error;
- }
else {
*p++ = outCh;
}
@@ -2306,8 +2302,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
inShift = 0;
s++;
if (surrogate) {
- errmsg = "second surrogate missing at end of shift sequence";
- goto utf7Error;
+ *p++ = surrogate;
+ surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) {