From 8ac004e69895e8fd525307fdc1e093f92b15ce09 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 15 Jul 2007 13:00:05 +0000 Subject: Make chr() and ord() return/accept surrogate pairs in narrow builds. The domain of chr() and the range of ord() are now always [0 ... 0x10FFFF]. --- Lib/test/test_builtin.py | 35 +++++++++++++++++++++++++++-------- Objects/unicodeobject.c | 19 +++++++++---------- Python/bltinmodule.c | 25 +++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 036a9f2..7e37c29 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -169,15 +169,23 @@ class BuiltinTest(unittest.TestCase): self.assertEqual(chr(97), 'a') self.assertEqual(chr(0xff), '\xff') self.assertRaises(ValueError, chr, 1<<24) - self.assertEqual( - chr(sys.maxunicode), - str(('\\U%08x' % (sys.maxunicode)).encode("ascii"), 'unicode-escape') - ) - self.assertRaises(ValueError, chr, sys.maxunicode+1) + self.assertEqual(chr(sys.maxunicode), + str(('\\U%08x' % (sys.maxunicode)).encode("ascii"), + 'unicode-escape')) self.assertRaises(TypeError, chr) - - def XXX_test_cmp(self): - # cmp() is no longer supported + self.assertEqual(chr(0x0000FFFF), "\U0000FFFF") + self.assertEqual(chr(0x00010000), "\U00010000") + self.assertEqual(chr(0x00010001), "\U00010001") + self.assertEqual(chr(0x000FFFFE), "\U000FFFFE") + self.assertEqual(chr(0x000FFFFF), "\U000FFFFF") + self.assertEqual(chr(0x00100000), "\U00100000") + self.assertEqual(chr(0x00100001), "\U00100001") + self.assertEqual(chr(0x0010FFFE), "\U0010FFFE") + self.assertEqual(chr(0x0010FFFF), "\U0010FFFF") + self.assertRaises(ValueError, chr, -1) + self.assertRaises(ValueError, chr, 0x00110000) + + def test_cmp(self): self.assertEqual(cmp(-1, 1), -1) self.assertEqual(cmp(1, -1), 1) self.assertEqual(cmp(1, 1), 0) @@ -1288,6 +1296,17 @@ class BuiltinTest(unittest.TestCase): self.assertEqual(ord(chr(sys.maxunicode)), sys.maxunicode) self.assertRaises(TypeError, ord, 42) + self.assertEqual(ord(chr(0x10FFFF)), 0x10FFFF) + self.assertEqual(ord("\U0000FFFF"), 0x0000FFFF) + self.assertEqual(ord("\U00010000"), 0x00010000) + self.assertEqual(ord("\U00010001"), 0x00010001) + self.assertEqual(ord("\U000FFFFE"), 0x000FFFFE) + self.assertEqual(ord("\U000FFFFF"), 0x000FFFFF) + self.assertEqual(ord("\U00100000"), 0x00100000) + self.assertEqual(ord("\U00100001"), 0x00100001) + self.assertEqual(ord("\U0010FFFE"), 0x0010FFFE) + self.assertEqual(ord("\U0010FFFF"), 0x0010FFFF) + def test_pow(self): self.assertEqual(pow(0,0), 1) self.assertEqual(pow(0,1), 0) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2728f1f..a60fa8b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -915,21 +915,20 @@ Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, PyObject *PyUnicode_FromOrdinal(int ordinal) { - Py_UNICODE s[1]; + Py_UNICODE s[2]; -#ifdef Py_UNICODE_WIDE if (ordinal < 0 || ordinal > 0x10ffff) { PyErr_SetString(PyExc_ValueError, - "chr() arg not in range(0x110000) " - "(wide Python build)"); + "chr() arg not in range(0x110000)"); return NULL; } -#else - if (ordinal < 0 || ordinal > 0xffff) { - PyErr_SetString(PyExc_ValueError, - "chr() arg not in range(0x10000) " - "(narrow Python build)"); - return NULL; + +#ifndef Py_UNICODE_WIDE + if (ordinal > 0xffff) { + ordinal -= 0x10000; + s[0] = 0xD800 | (ordinal >> 10); + s[1] = 0xDC00 | (ordinal & 0x3FF); + return PyUnicode_FromUnicode(s, 2); } #endif diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c index db9ac2c..08c1a00 100644 --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -317,7 +317,11 @@ builtin_chr(PyObject *self, PyObject *args) PyDoc_STRVAR(chr_doc, "chr(i) -> Unicode character\n\ \n\ -Return a Unicode string of one character with ordinal i; 0 <= i <= 0x10ffff."); +Return a Unicode string of one character with ordinal i; 0 <= i <= 0x10ffff." +#ifndef Py_UNICODE_WIDE +"\nIf 0x10000 <= i, a surrogate pair is returned." +#endif +); static PyObject * @@ -1179,6 +1183,19 @@ builtin_ord(PyObject *self, PyObject* obj) ord = (long)*PyUnicode_AS_UNICODE(obj); return PyInt_FromLong(ord); } +#ifndef Py_UNICODE_WIDE + if (size == 2) { + /* Decode a valid surrogate pair */ + int c0 = PyUnicode_AS_UNICODE(obj)[0]; + int c1 = PyUnicode_AS_UNICODE(obj)[1]; + if (0xD800 <= c0 && c0 <= 0xDBFF && + 0xDC00 <= c1 && c1 <= 0xDFFF) { + ord = ((((c0 & 0x03FF) << 10) | (c1 & 0x03FF)) + + 0x00010000); + return PyInt_FromLong(ord); + } + } +#endif } else if (PyBytes_Check(obj)) { /* XXX Hopefully this is temporary */ @@ -1205,7 +1222,11 @@ builtin_ord(PyObject *self, PyObject* obj) PyDoc_STRVAR(ord_doc, "ord(c) -> integer\n\ \n\ -Return the integer ordinal of a one-character string."); +Return the integer ordinal of a one-character string." +#ifndef Py_UNICODE_WIDE +"\nA valid surrogate pair is also accepted." +#endif +); static PyObject * -- cgit v0.12