summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@haypocalc.com> 2010-10-02 11:11:27 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com> 2010-10-02 11:11:27 (GMT)
commit: 5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302 (patch)
tree: ad617343c26a37551383247908d741ab31234842
parent: 1c24bd02520a647415de5c220834d7bec265a8d0 (diff)
download: cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.zip
cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.tar.gz
cpython-5593d8aeb4bcc904ff58e8e3eb8b799a0aabc302.tar.bz2
Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() now replace
UTF-16 surrogate pairs with single non-BMP characters when Py_UNICODE is 16 bits wide and wchar_t is 32 bits wide (e.g. a narrow build on Linux).
-rw-r--r-- Lib/test/test_unicode.py | 20
-rw-r--r-- Misc/NEWS | 6
-rw-r--r-- Objects/unicodeobject.c | 127
3 files changed, 130 insertions, 23 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 2ac79fb..285b7af 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1419,6 +1419,17 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEquals(size, 7)
self.assertEquals(wchar, 'abc\0def\0')
+ nonbmp = chr(0x10ffff)
+ if sizeof(c_wchar) == 2:
+ buflen = 3
+ nchar = 2
+ else: # sizeof(c_wchar) == 4
+ buflen = 2
+ nchar = 1
+ wchar, size = test_aswidechar(nonbmp, buflen)
+ self.assertEquals(size, nchar)
+ self.assertEquals(wchar, nonbmp + '\0')
+
# Test PyUnicode_AsWideCharString()
def test_aswidecharstring(self):
from _testcapi import test_aswidecharstring
@@ -1432,6 +1443,15 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEquals(size, 7)
self.assertEquals(wchar, 'abc\0def\0')
+ nonbmp = chr(0x10ffff)
+ if sizeof(c_wchar) == 2:
+ nchar = 2
+ else: # sizeof(c_wchar) == 4
+ nchar = 1
+ wchar, size = test_aswidecharstring(nonbmp)
+ self.assertEquals(size, nchar)
+ self.assertEquals(wchar, nonbmp + '\0')
+
def test_main():
support.run_unittest(__name__)
diff --git a/Misc/NEWS b/Misc/NEWS
index 1d943b0..efb6791 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,10 +10,14 @@ What's New in Python 3.2 Alpha 3?
Core and Builtins
-----------------
+- Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace
+ UTF-16 surrogate pairs by single non-BMP characters for 16 bits Py_UNICODE
+ and 32 bits wchar_t (eg. Linux in narrow build).
+
- Issue #10006: type.__abstractmethods__ now raises an AttributeError.
- Issue #10003: Allow handling of SIGBREAK on Windows. Fixes a regression
- introduced by issue #9324.
+ introduced by issue #9324.
- Issue #9979: Create function PyUnicode_AsWideCharString().
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1c083b2..3fd22a3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1153,19 +1153,112 @@ PyUnicode_FromFormat(const char *format, ...)
return ret;
}
-static void
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
+ convert a Unicode object to a wide character string.
+
+ - If w is NULL: return the number of wide characters (including the nul
+ character) required to convert the unicode object. Ignore size argument.
+
+ - Otherwise: return the number of wide characters (excluding the nul
+ character) written into w. Write at most size wide characters (including
+ the nul character). */
+static Py_ssize_t
unicode_aswidechar(PyUnicodeObject *unicode,
wchar_t *w,
Py_ssize_t size)
{
#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
- memcpy(w, unicode->str, size * sizeof(wchar_t));
-#else
- register Py_UNICODE *u;
+ Py_ssize_t res;
+ if (w != NULL) {
+ res = PyUnicode_GET_SIZE(unicode);
+ if (size > res)
+ size = res + 1;
+ else
+ res = size;
+ memcpy(w, unicode->str, size * sizeof(wchar_t));
+ return res;
+ }
+ else
+ return PyUnicode_GET_SIZE(unicode) + 1;
+#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
+ register const Py_UNICODE *u;
+ const Py_UNICODE *uend;
+ const wchar_t *worig, *wend;
+ Py_ssize_t nchar;
+
+ u = PyUnicode_AS_UNICODE(unicode);
+ uend = u + PyUnicode_GET_SIZE(unicode);
+ if (w != NULL) {
+ worig = w;
+ wend = w + size;
+ while (u != uend && w != wend) {
+ if (0xD800 <= u[0] && u[0] <= 0xDBFF
+ && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+ {
+ *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
+ u += 2;
+ }
+ else {
+ *w = *u;
+ u++;
+ }
+ w++;
+ }
+ if (w != wend)
+ *w = L'\0';
+ return w - worig;
+ }
+ else {
+ nchar = 1; /* nul character at the end */
+ while (u != uend) {
+ if (0xD800 <= u[0] && u[0] <= 0xDBFF
+ && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+ u += 2;
+ else
+ u++;
+ nchar++;
+ }
+ }
+ return nchar;
+#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
+ register Py_UNICODE *u, *uend, ordinal;
register Py_ssize_t i;
+ wchar_t *worig, *wend;
+ Py_ssize_t nchar;
+
u = PyUnicode_AS_UNICODE(unicode);
- for (i = size; i > 0; i--)
- *w++ = *u++;
+ uend = u + PyUnicode_GET_SIZE(u);
+ if (w != NULL) {
+ worig = w;
+ wend = w + size;
+ while (u != uend && w != wend) {
+ ordinal = *u;
+ if (ordinal > 0xffff) {
+ ordinal -= 0x10000;
+ *w++ = 0xD800 | (ordinal >> 10);
+ *w++ = 0xDC00 | (ordinal & 0x3FF);
+ }
+ else
+ *w++ = ordinal;
+ u++;
+ }
+ if (w != wend)
+ *w = 0;
+ return w - worig;
+ }
+ else {
+ nchar = 1; /* nul character */
+ while (u != uend) {
+ if (*u > 0xffff)
+ nchar += 2;
+ else
+ nchar++;
+ u++;
+ }
+ return nchar;
+ }
+#else
+# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
#endif
}
@@ -1178,17 +1271,7 @@ PyUnicode_AsWideChar(PyUnicodeObject *unicode,
PyErr_BadInternalCall();
return -1;
}
-
- /* If possible, try to copy the 0-termination as well */
- if (size > PyUnicode_GET_SIZE(unicode))
- size = PyUnicode_GET_SIZE(unicode) + 1;
-
- unicode_aswidechar(unicode, w, size);
-
- if (size > PyUnicode_GET_SIZE(unicode))
- return PyUnicode_GET_SIZE(unicode);
- else
- return size;
+ return unicode_aswidechar(unicode, w, size);
}
wchar_t*
@@ -1203,20 +1286,20 @@ PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
return NULL;
}
- if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
+ buflen = unicode_aswidechar(unicode, NULL, 0);
+ if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
PyErr_NoMemory();
return NULL;
}
- buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
if (buffer == NULL) {
PyErr_NoMemory();
return NULL;
}
- unicode_aswidechar(unicode, buffer, buflen);
- if (size)
- *size = buflen - 1;
+ buflen = unicode_aswidechar(unicode, buffer, buflen);
+ if (size != NULL)
+ *size = buflen;
return buffer;
}