diff options
author | Victor Stinner <victor.stinner@haypocalc.com> | 2010-10-02 11:11:27 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@haypocalc.com> | 2010-10-02 11:11:27 (GMT) |
commit | 5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302 (patch) | |
tree | ad617343c26a37551383247908d741ab31234842 | |
parent | 1c24bd02520a647415de5c220834d7bec265a8d0 (diff) | |
download | cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.zip cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.tar.gz cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.tar.bz2 |
Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace
UTF-16 surrogate pairs with single non-BMP characters when Py_UNICODE is 16-bit
and wchar_t is 32-bit (e.g. a narrow build on Linux).
-rw-r--r-- | Lib/test/test_unicode.py | 20 | ||||
-rw-r--r-- | Misc/NEWS | 6 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 127 |
3 files changed, 130 insertions, 23 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 2ac79fb..285b7af 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1419,6 +1419,17 @@ class UnicodeTest(string_tests.CommonTest, self.assertEquals(size, 7) self.assertEquals(wchar, 'abc\0def\0') + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + buflen = 3 + nchar = 2 + else: # sizeof(c_wchar) == 4 + buflen = 2 + nchar = 1 + wchar, size = test_aswidechar(nonbmp, buflen) + self.assertEquals(size, nchar) + self.assertEquals(wchar, nonbmp + '\0') + # Test PyUnicode_AsWideCharString() def test_aswidecharstring(self): from _testcapi import test_aswidecharstring @@ -1432,6 +1443,15 @@ class UnicodeTest(string_tests.CommonTest, self.assertEquals(size, 7) self.assertEquals(wchar, 'abc\0def\0') + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + nchar = 2 + else: # sizeof(c_wchar) == 4 + nchar = 1 + wchar, size = test_aswidecharstring(nonbmp) + self.assertEquals(size, nchar) + self.assertEquals(wchar, nonbmp + '\0') + def test_main(): support.run_unittest(__name__) @@ -10,10 +10,14 @@ What's New in Python 3.2 Alpha 3? Core and Builtins ----------------- +- Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace + UTF-16 surrogate pairs by single non-BMP characters for 16 bits Py_UNICODE + and 32 bits wchar_t (eg. Linux in narrow build). + - Issue #10006: type.__abstractmethods__ now raises an AttributeError. - Issue #10003: Allow handling of SIGBREAK on Windows. Fixes a regression - introduced by issue #9324. + introduced by issue #9324. - Issue #9979: Create function PyUnicode_AsWideCharString(). diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1c083b2..3fd22a3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1153,19 +1153,112 @@ PyUnicode_FromFormat(const char *format, ...) 
return ret; } -static void +/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): + convert a Unicode object to a wide character string. + + - If w is NULL: return the number of wide characters (including the nul + character) required to convert the unicode object. Ignore size argument. + + - Otherwise: return the number of wide characters (excluding the nul + character) written into w. Write at most size wide characters (including + the nul character). */ +static Py_ssize_t unicode_aswidechar(PyUnicodeObject *unicode, wchar_t *w, Py_ssize_t size) { #if Py_UNICODE_SIZE == SIZEOF_WCHAR_T - memcpy(w, unicode->str, size * sizeof(wchar_t)); -#else - register Py_UNICODE *u; + Py_ssize_t res; + if (w != NULL) { + res = PyUnicode_GET_SIZE(unicode); + if (size > res) + size = res + 1; + else + res = size; + memcpy(w, unicode->str, size * sizeof(wchar_t)); + return res; + } + else + return PyUnicode_GET_SIZE(unicode) + 1; +#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 + register const Py_UNICODE *u; + const Py_UNICODE *uend; + const wchar_t *worig, *wend; + Py_ssize_t nchar; + + u = PyUnicode_AS_UNICODE(unicode); + uend = u + PyUnicode_GET_SIZE(unicode); + if (w != NULL) { + worig = w; + wend = w + size; + while (u != uend && w != wend) { + if (0xD800 <= u[0] && u[0] <= 0xDBFF + && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + { + *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; + u += 2; + } + else { + *w = *u; + u++; + } + w++; + } + if (w != wend) + *w = L'\0'; + return w - worig; + } + else { + nchar = 1; /* nul character at the end */ + while (u != uend) { + if (0xD800 <= u[0] && u[0] <= 0xDBFF + && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + u += 2; + else + u++; + nchar++; + } + } + return nchar; +#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 + register Py_UNICODE *u, *uend, ordinal; register Py_ssize_t i; + wchar_t *worig, *wend; + Py_ssize_t nchar; + u = PyUnicode_AS_UNICODE(unicode); - for (i = size; i > 0; i--) - *w++ = *u++; + uend = u + 
PyUnicode_GET_SIZE(u); + if (w != NULL) { + worig = w; + wend = w + size; + while (u != uend && w != wend) { + ordinal = *u; + if (ordinal > 0xffff) { + ordinal -= 0x10000; + *w++ = 0xD800 | (ordinal >> 10); + *w++ = 0xDC00 | (ordinal & 0x3FF); + } + else + *w++ = ordinal; + u++; + } + if (w != wend) + *w = 0; + return w - worig; + } + else { + nchar = 1; /* nul character */ + while (u != uend) { + if (*u > 0xffff) + nchar += 2; + else + nchar++; + u++; + } + return nchar; + } +#else +# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" #endif } @@ -1178,17 +1271,7 @@ PyUnicode_AsWideChar(PyUnicodeObject *unicode, PyErr_BadInternalCall(); return -1; } - - /* If possible, try to copy the 0-termination as well */ - if (size > PyUnicode_GET_SIZE(unicode)) - size = PyUnicode_GET_SIZE(unicode) + 1; - - unicode_aswidechar(unicode, w, size); - - if (size > PyUnicode_GET_SIZE(unicode)) - return PyUnicode_GET_SIZE(unicode); - else - return size; + return unicode_aswidechar(unicode, w, size); } wchar_t* @@ -1203,20 +1286,20 @@ PyUnicode_AsWideCharString(PyUnicodeObject *unicode, return NULL; } - if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) { + buflen = unicode_aswidechar(unicode, NULL, 0); + if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { PyErr_NoMemory(); return NULL; } - buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */ buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); if (buffer == NULL) { PyErr_NoMemory(); return NULL; } - unicode_aswidechar(unicode, buffer, buflen); - if (size) - *size = buflen - 1; + buflen = unicode_aswidechar(unicode, buffer, buflen); + if (size != NULL) + *size = buflen; return buffer; } |