summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAmaury Forgeot d'Arc <amauryfa@gmail.com>2008-03-23 09:55:29 (GMT)
committerAmaury Forgeot d'Arc <amauryfa@gmail.com>2008-03-23 09:55:29 (GMT)
commit9a0d3462fcac06cb257b77cad464d01d348f702c (patch)
tree268f736c3972c2f1ca590b8b5e23d5363f842c25
parent61854332b932d1d561fc58f815056d008e86ddd3 (diff)
downloadcpython-9a0d3462fcac06cb257b77cad464d01d348f702c.zip
cpython-9a0d3462fcac06cb257b77cad464d01d348f702c.tar.gz
cpython-9a0d3462fcac06cb257b77cad464d01d348f702c.tar.bz2
#1477: ur'\U0010FFFF' raised in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
-rw-r--r--Lib/test/test_unicode.py17
-rw-r--r--Misc/NEWS6
-rw-r--r--Objects/unicodeobject.c46
3 files changed, 63 insertions, 6 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index bdc7192..24e8e77 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -736,12 +736,25 @@ class UnicodeTest(
print >>out, u'def\n'
def test_ucs4(self):
- if sys.maxunicode == 0xFFFF:
- return
x = u'\U00100000'
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
self.assertEqual(x, y)
+ y = r'\U00100000'
+ x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+ self.assertEqual(x, y)
+ y = r'\U00010000'
+ x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+ self.assertEqual(x, y)
+
+ try:
+ '\U11111111'.decode("raw-unicode-escape")
+ except UnicodeDecodeError as e:
+ self.assertEqual(e.start, 0)
+ self.assertEqual(e.end, 10)
+ else:
+ self.fail("Should have raised UnicodeDecodeError")
+
def test_conversion(self):
# Make sure __unicode__() works properly
class Foo0:
diff --git a/Misc/NEWS b/Misc/NEWS
index 62803ff..f7b16b4 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2?
Core and builtins
-----------------
+- Issue #1477: With narrow Unicode builds, the unicode escape sequence
+ \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane. This
+ affected raw unicode literals and the 'raw-unicode-escape' codec. Now
+ UTF-16 surrogates are generated in this case, like normal unicode literals
+ and the 'unicode-escape' codec.
+
- Issue #2348: add Py3k warning for file.softspace.
- Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5878f96..4df9fd8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
else
x += 10 + c - 'A';
}
-#ifndef Py_UNICODE_WIDE
- if (x > 0x10000) {
+ if (x <= 0xffff)
+ /* UCS-2 character */
+ *p++ = (Py_UNICODE) x;
+ else if (x <= 0x10ffff) {
+ /* UCS-4 character. Either store directly, or as
+ surrogate pair. */
+#ifdef Py_UNICODE_WIDE
+ *p++ = (Py_UNICODE) x;
+#else
+ x -= 0x10000L;
+ *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
+ *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
+#endif
+ } else {
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"rawunicodeescape", "\\Uxxxxxxxx out of range",
@@ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
(PyObject **)&v, &outpos, &p))
goto onError;
}
-#endif
- *p++ = x;
nextByte:
;
}
@@ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
*p++ = hexdigit[ch & 15];
}
else
+#else
+ /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+ if (ch >= 0xD800 && ch < 0xDC00) {
+ Py_UNICODE ch2;
+ Py_UCS4 ucs;
+
+ ch2 = *s++;
+ size--;
+ if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+ ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ *p++ = '\\';
+ *p++ = 'U';
+ *p++ = hexdigit[(ucs >> 28) & 0xf];
+ *p++ = hexdigit[(ucs >> 24) & 0xf];
+ *p++ = hexdigit[(ucs >> 20) & 0xf];
+ *p++ = hexdigit[(ucs >> 16) & 0xf];
+ *p++ = hexdigit[(ucs >> 12) & 0xf];
+ *p++ = hexdigit[(ucs >> 8) & 0xf];
+ *p++ = hexdigit[(ucs >> 4) & 0xf];
+ *p++ = hexdigit[ucs & 0xf];
+ continue;
+ }
+ /* Fall through: isolated surrogates are copied as-is */
+ s--;
+ size++;
+ }
#endif
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {