summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mark Dickinson <dickinsm@gmail.com> 2009-03-18 14:47:41 (GMT)
committer Mark Dickinson <dickinsm@gmail.com> 2009-03-18 14:47:41 (GMT)
commit 081dfee4f154f4dfd11a3cf14516340f385049bd (patch)
tree 2b4fd8b8827acc861ee7e6ecbd6f39bd3a4bdaea
parent ecdfd513a2a506f70c4d5aa0f3d39b9323f91e6e (diff)
download cpython-081dfee4f154f4dfd11a3cf14516340f385049bd.zip
cpython-081dfee4f154f4dfd11a3cf14516340f385049bd.tar.gz
cpython-081dfee4f154f4dfd11a3cf14516340f385049bd.tar.bz2
Issue 4474: On platforms with sizeof(wchar_t) == 4 and
sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts each character outside the BMP to the appropriate surrogate pair. Thanks Victor Stinner for the patch.
-rw-r--r-- Misc/NEWS 4
-rw-r--r-- Modules/_testcapimodule.c 45
-rw-r--r-- Objects/unicodeobject.c 64
3 files changed, 113 insertions, 0 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index e289c18..90b6091 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 2?
Core and Builtins
-----------------
+- Issue #4474: PyUnicode_FromWideChar now converts characters outside
+ the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
+ and sizeof(Py_UNICODE) == 2.
+
- Issue #5237: Allow auto-numbered fields in str.format(). For
example: '{} {}'.format(1, 2) == '1 2'.
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 438d5a2..4ba4898 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -708,6 +708,50 @@ test_Z_code(PyObject *self)
}
static PyObject *
+test_widechar(PyObject *self)
+{
+    /* The code point U+10ABCD as wchar_t data: a single unit when
+       wchar_t is 4 bytes wide, otherwise the surrogate pair
+       0xDBEA 0xDFCD.  The spare array slot is zero-initialized. */
+#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
+    size_t wtextlen = 1;
+#else
+    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
+    size_t wtextlen = 2;
+#endif
+    PyObject *from_wide;
+    PyObject *from_utf8;
+    int same_length;
+    int differs = 0;
+
+    from_wide = PyUnicode_FromWideChar(wtext, wtextlen);
+    if (from_wide == NULL)
+        return NULL;
+
+    /* The same code point, U+10ABCD, spelled as UTF-8 bytes. */
+    from_utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
+    if (from_utf8 == NULL) {
+        Py_DECREF(from_wide);
+        return NULL;
+    }
+
+    /* Run both comparisons while the strings are alive, then release
+       them once; the content comparison is only attempted when the
+       lengths already agree. */
+    same_length = (PyUnicode_GET_SIZE(from_wide) ==
+                   PyUnicode_GET_SIZE(from_utf8));
+    if (same_length)
+        differs = PyUnicode_Compare(from_wide, from_utf8);
+    Py_DECREF(from_wide);
+    Py_DECREF(from_utf8);
+
+    if (!same_length)
+        return raiseTestError("test_widechar",
+                              "wide string and utf8 string "
+                              "have different length");
+    if (differs) {
+        /* A non-zero result may also mean the comparison itself failed. */
+        if (PyErr_Occurred())
+            return NULL;
+        return raiseTestError("test_widechar",
+                              "wide string and utf8 string "
+                              "are different");
+    }
+
+    Py_RETURN_NONE;
+}
+
+static PyObject *
test_empty_argparse(PyObject *self)
{
/* Test that formats can begin with '|'. See issue #4720. */
@@ -1206,6 +1250,7 @@ static PyMethodDef TestMethods[] = {
{"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
+ {"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
#ifdef WITH_THREAD
{"_test_thread_state", test_thread_state, METH_VARARGS},
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e88c8c1..03c65e3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -561,6 +561,66 @@ PyObject *PyUnicode_FromString(const char *u)
#ifdef HAVE_WCHAR_H
+#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+
+/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so the input is
+   converted from UTF-32 to UTF-16 (values above 0xFFFF become surrogate pairs). */
+
+/* Create a unicode object from the wchar_t buffer `w`, for builds where
+   wchar_t is 4 bytes wide but Py_UNICODE is only 2.
+
+   w    -- source buffer; may be NULL only when size == 0, in which case
+           the result of PyUnicode_FromStringAndSize(NULL, 0) is returned.
+   size -- number of wchar_t units to convert, or -1 to use wcslen(w).
+
+   Values up to 0xFFFF are copied unchanged; values in
+   0x10000..0x10FFFF are written as a UTF-16 surrogate pair.  A value
+   above 0x10FFFF has no UTF-16 representation, so it is rejected with
+   ValueError instead of being turned into a malformed pair.
+
+   Returns the new object, or NULL on failure (bad arguments,
+   out-of-range input, or _PyUnicode_New() returning NULL). */
+PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
+                                 Py_ssize_t size)
+{
+    PyUnicodeObject *unicode;
+    Py_UNICODE *u;
+    Py_ssize_t i;
+    Py_ssize_t alloc;
+
+    if (w == NULL) {
+        if (size == 0)
+            return PyUnicode_FromStringAndSize(NULL, 0);
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    if (size == -1) {
+        size = wcslen(w);
+    }
+
+    /* First pass: validate the input and count the output units.  Each
+       value above 0xFFFF needs one extra Py_UNICODE for its second
+       surrogate.  Validating here means nothing has been allocated yet
+       when an error is reported. */
+    alloc = size;
+    for (i = 0; i < size; i++) {
+        if (w[i] > 0x10FFFF) {
+            PyErr_SetString(PyExc_ValueError,
+                            "wide character is not in range "
+                            "[U+0000; U+10FFFF]");
+            return NULL;
+        }
+        if (w[i] > 0xFFFF)
+            alloc++;
+    }
+
+    unicode = _PyUnicode_New(alloc);
+    if (!unicode)
+        return NULL;
+
+    /* Second pass: copy the wchar_t data into the new object. */
+    u = PyUnicode_AS_UNICODE(unicode);
+    for (i = 0; i < size; i++) {
+        if (w[i] > 0xFFFF) {
+            /* 20-bit offset into the supplementary planes: the top 10
+               bits go into the high surrogate, the low 10 bits into
+               the low surrogate. */
+            unsigned long ordinal = (unsigned long)w[i] - 0x10000;
+            *u++ = (Py_UNICODE)(0xD800 | (ordinal >> 10));
+            *u++ = (Py_UNICODE)(0xDC00 | (ordinal & 0x3FF));
+        }
+        else
+            *u++ = (Py_UNICODE)w[i];
+    }
+    return (PyObject *)unicode;
+}
+
+#else
+
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Py_ssize_t size)
{
@@ -597,6 +657,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return (PyObject *)unicode;
}
+#endif /* CONVERT_WCHAR_TO_SURROGATES */
+
+#undef CONVERT_WCHAR_TO_SURROGATES
+
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{