From a4db68622c585e1bb526ef89f5d5ccf602906110 Mon Sep 17 00:00:00 2001 From: Amaury Forgeot d'Arc Date: Fri, 4 Jul 2008 21:26:43 +0000 Subject: Issue #3280: like chr() already does, the "%c" format now accepts the full unicode range even on "narrow Unicode" builds; the result is a pair of UTF-16 surrogates. --- Lib/test/test_unicode.py | 5 ++++- Misc/NEWS | 5 +++++ Objects/unicodeobject.c | 47 ++++++++++++++++++++++++++++++++--------------- Python/modsupport.c | 13 ++----------- 4 files changed, 43 insertions(+), 27 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 4c81205..fb904bf 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -717,7 +717,10 @@ class UnicodeTest( self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def') self.assertEqual('%c' % 0x1234, '\u1234') - self.assertRaises(OverflowError, "%c".__mod__, (sys.maxunicode+1,)) + self.assertEqual('%c' % 0x21483, '\U00021483') + self.assertRaises(OverflowError, "%c".__mod__, (0x110000,)) + self.assertEqual('%c' % '\U00021483', '\U00021483') + self.assertRaises(TypeError, "%c".__mod__, "aa") # formatting jobs delegated from the string implementation: self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') diff --git a/Misc/NEWS b/Misc/NEWS index 76e9552..4024991 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,11 @@ What's new in Python 3.0b2? Core and Builtins ----------------- +- Issue #3280: like chr(), the "%c" format now accepts unicode code points + beyond the Basic Multilingual Plane (above 0xffff) on all configurations. On + "narrow Unicode" builds, the result is a string of 2 code units, forming a + UTF-16 surrogate pair. + - Issue #3282: str.isprintable() should return False for undefined Unicode characters. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 53dbe55..9dead63 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8730,11 +8730,28 @@ formatchar(Py_UNICODE *buf, size_t buflen, PyObject *v) { - /* presume that the buffer is at least 2 characters long */ + /* presume that the buffer is at least 3 characters long */ if (PyUnicode_Check(v)) { - if (PyUnicode_GET_SIZE(v) != 1) - goto onError; - buf[0] = PyUnicode_AS_UNICODE(v)[0]; + if (PyUnicode_GET_SIZE(v) == 1) { + buf[0] = PyUnicode_AS_UNICODE(v)[0]; + buf[1] = '\0'; + return 1; + } +#ifndef Py_UNICODE_WIDE + if (PyUnicode_GET_SIZE(v) == 2) { + /* Decode a valid surrogate pair */ + int c0 = PyUnicode_AS_UNICODE(v)[0]; + int c1 = PyUnicode_AS_UNICODE(v)[1]; + if (0xD800 <= c0 && c0 <= 0xDBFF && + 0xDC00 <= c1 && c1 <= 0xDFFF) { + buf[0] = c0; + buf[1] = c1; + buf[2] = '\0'; + return 2; + } + } +#endif + goto onError; } else { /* Integer input truncated to a character */ @@ -8742,25 +8759,25 @@ formatchar(Py_UNICODE *buf, x = PyLong_AsLong(v); if (x == -1 && PyErr_Occurred()) goto onError; -#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x110000) " - "(wide Python build)"); + "%c arg not in range(0x110000)"); return -1; } -#else - if (x < 0 || x > 0xffff) { - PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x10000) " - "(narrow Python build)"); - return -1; + +#ifndef Py_UNICODE_WIDE + if (x > 0xffff) { + x -= 0x10000; + buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); + buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); + return 2; } #endif buf[0] = (Py_UNICODE) x; + buf[1] = '\0'; + return 1; } - buf[1] = '\0'; - return 1; onError: PyErr_SetString(PyExc_TypeError, diff --git a/Python/modsupport.c b/Python/modsupport.c index b88c1ed..e39c315 100644 --- a/Python/modsupport.c +++ b/Python/modsupport.c @@ -294,21 +294,12 @@ do_mkvalue(const char **p_format, va_list *p_va, int flags) case 'C': { int i = 
va_arg(*p_va, int); - Py_UNICODE c; if (i < 0 || i > PyUnicode_GetMax()) { -#ifdef Py_UNICODE_WIDE PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x110000) " - "(wide Python build)"); -#else - PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x10000) " - "(narrow Python build)"); -#endif + "%c arg not in range(0x110000)"); return NULL; } - c = i; - return PyUnicode_FromUnicode(&c, 1); + return PyUnicode_FromOrdinal(i); } case 's': -- cgit v0.12