summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2018-10-23 19:58:24 (GMT)
committer: GitHub <noreply@github.com>, 2018-10-23 19:58:24 (GMT)
commit: c46db9232f1a6e0e3c33053549d03d4335db9dca (patch)
tree: 72098815b2ebf289beedcfad8d7f2246c507b6bf /Objects/unicodeobject.c
parent: df13df41a25765d8a39a77220691698498e758d4 (diff)
download: cpython-c46db9232f1a6e0e3c33053549d03d4335db9dca.zip
download: cpython-c46db9232f1a6e0e3c33053549d03d4335db9dca.tar.gz
download: cpython-c46db9232f1a6e0e3c33053549d03d4335db9dca.tar.bz2
bpo-30863: Rewrite PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). (GH-2599)
They no longer cache the wchar_t* representation of string objects.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c | 244
1 files changed, 121 insertions, 123 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index db9b25e..31703d3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2921,6 +2921,83 @@ PyUnicode_FromFormat(const char *format, ...)
return ret;
}
+static Py_ssize_t
+unicode_get_widechar_size(PyObject *unicode)
+{
+ Py_ssize_t res;
+
+ assert(unicode != NULL);
+ assert(_PyUnicode_CHECK(unicode));
+
+ if (_PyUnicode_WSTR(unicode) != NULL) {
+ return PyUnicode_WSTR_LENGTH(unicode);
+ }
+ assert(PyUnicode_IS_READY(unicode));
+
+ res = _PyUnicode_LENGTH(unicode);
+#if SIZEOF_WCHAR_T == 2
+ if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+ const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
+ const Py_UCS4 *end = s + res;
+ for (; s < end; ++s) {
+ if (*s > 0xFFFF) {
+ ++res;
+ }
+ }
+ }
+#endif
+ return res;
+}
+
+static void
+unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
+{
+ const wchar_t *wstr;
+
+ assert(unicode != NULL);
+ assert(_PyUnicode_CHECK(unicode));
+
+ wstr = _PyUnicode_WSTR(unicode);
+ if (wstr != NULL) {
+ memcpy(w, wstr, size * sizeof(wchar_t));
+ return;
+ }
+ assert(PyUnicode_IS_READY(unicode));
+
+ if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
+ const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
+ for (; size--; ++s, ++w) {
+ *w = *s;
+ }
+ }
+ else {
+#if SIZEOF_WCHAR_T == 4
+ assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
+ const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
+ for (; size--; ++s, ++w) {
+ *w = *s;
+ }
+#else
+ assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
+ const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
+ for (; size--; ++s, ++w) {
+ Py_UCS4 ch = *s;
+ if (ch > 0xFFFF) {
+ assert(ch <= MAX_UNICODE);
+ /* encode surrogate pair in this case */
+ *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
+ if (!size--)
+ break;
+ *w = Py_UNICODE_LOW_SURROGATE(ch);
+ }
+ else {
+ *w = ch;
+ }
+ }
+#endif
+ }
+}
+
#ifdef HAVE_WCHAR_H
/* Convert a Unicode object to a wide character string.
@@ -2937,33 +3014,35 @@ PyUnicode_AsWideChar(PyObject *unicode,
Py_ssize_t size)
{
Py_ssize_t res;
- const wchar_t *wstr;
if (unicode == NULL) {
PyErr_BadInternalCall();
return -1;
}
- wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
- if (wstr == NULL)
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
return -1;
-
- if (w != NULL) {
- if (size > res)
- size = res + 1;
- else
- res = size;
- memcpy(w, wstr, size * sizeof(wchar_t));
- return res;
}
- else
+
+ res = unicode_get_widechar_size(unicode);
+ if (w == NULL) {
return res + 1;
+ }
+
+ if (size > res) {
+ size = res + 1;
+ }
+ else {
+ res = size;
+ }
+ unicode_copy_as_widechar(unicode, w, size);
+ return res;
}
wchar_t*
PyUnicode_AsWideCharString(PyObject *unicode,
Py_ssize_t *size)
{
- const wchar_t *wstr;
wchar_t *buffer;
Py_ssize_t buflen;
@@ -2971,25 +3050,27 @@ PyUnicode_AsWideCharString(PyObject *unicode,
PyErr_BadInternalCall();
return NULL;
}
-
- wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
- if (wstr == NULL) {
- return NULL;
- }
- if (size == NULL && wcslen(wstr) != (size_t)buflen) {
- PyErr_SetString(PyExc_ValueError,
- "embedded null character");
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
return NULL;
}
- buffer = PyMem_NEW(wchar_t, buflen + 1);
+ buflen = unicode_get_widechar_size(unicode);
+ buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
if (buffer == NULL) {
PyErr_NoMemory();
return NULL;
}
- memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
- if (size != NULL)
+ unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+ if (size != NULL) {
*size = buflen;
+ }
+ else if (wcslen(buffer) != (size_t)buflen) {
+ PyMem_FREE(buffer);
+ PyErr_SetString(PyExc_ValueError,
+ "embedded null character");
+ return NULL;
+ }
return buffer;
}
@@ -3781,118 +3862,35 @@ PyUnicode_AsUTF8(PyObject *unicode)
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
- const unsigned char *one_byte;
-#if SIZEOF_WCHAR_T == 4
- const Py_UCS2 *two_bytes;
-#else
- const Py_UCS4 *four_bytes;
- const Py_UCS4 *ucs4_end;
- Py_ssize_t num_surrogates;
-#endif
- wchar_t *w;
- wchar_t *wchar_end;
-
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
}
- if (_PyUnicode_WSTR(unicode) == NULL) {
+ Py_UNICODE *w = _PyUnicode_WSTR(unicode);
+ if (w == NULL) {
/* Non-ASCII compact unicode object */
- assert(_PyUnicode_KIND(unicode) != 0);
+ assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
assert(PyUnicode_IS_READY(unicode));
- if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
-#if SIZEOF_WCHAR_T == 2
- four_bytes = PyUnicode_4BYTE_DATA(unicode);
- ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
- num_surrogates = 0;
-
- for (; four_bytes < ucs4_end; ++four_bytes) {
- if (*four_bytes > 0xFFFF)
- ++num_surrogates;
- }
-
- _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
- sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
- if (!_PyUnicode_WSTR(unicode)) {
- PyErr_NoMemory();
- return NULL;
- }
- _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
-
- w = _PyUnicode_WSTR(unicode);
- wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
- four_bytes = PyUnicode_4BYTE_DATA(unicode);
- for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
- if (*four_bytes > 0xFFFF) {
- assert(*four_bytes <= MAX_UNICODE);
- /* encode surrogate pair in this case */
- *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
- *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
- }
- else
- *w = *four_bytes;
-
- if (w > wchar_end) {
- Py_UNREACHABLE();
- }
- }
- *w = 0;
-#else
- /* sizeof(wchar_t) == 4 */
- Py_FatalError("Impossible unicode object state, wstr and str "
- "should share memory already.");
+ Py_ssize_t wlen = unicode_get_widechar_size(unicode);
+ if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
+ PyErr_NoMemory();
return NULL;
-#endif
}
- else {
- if ((size_t)_PyUnicode_LENGTH(unicode) >
- PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
- PyErr_NoMemory();
- return NULL;
- }
- _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
- (_PyUnicode_LENGTH(unicode) + 1));
- if (!_PyUnicode_WSTR(unicode)) {
- PyErr_NoMemory();
- return NULL;
- }
- if (!PyUnicode_IS_COMPACT_ASCII(unicode))
- _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
- w = _PyUnicode_WSTR(unicode);
- wchar_end = w + _PyUnicode_LENGTH(unicode);
-
- if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
- one_byte = PyUnicode_1BYTE_DATA(unicode);
- for (; w < wchar_end; ++one_byte, ++w)
- *w = *one_byte;
- /* null-terminate the wstr */
- *w = 0;
- }
- else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
-#if SIZEOF_WCHAR_T == 4
- two_bytes = PyUnicode_2BYTE_DATA(unicode);
- for (; w < wchar_end; ++two_bytes, ++w)
- *w = *two_bytes;
- /* null-terminate the wstr */
- *w = 0;
-#else
- /* sizeof(wchar_t) == 2 */
- PyObject_FREE(_PyUnicode_WSTR(unicode));
- _PyUnicode_WSTR(unicode) = NULL;
- Py_FatalError("Impossible unicode object state, wstr "
- "and str should share memory already.");
- return NULL;
-#endif
- }
- else {
- Py_UNREACHABLE();
- }
+ w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
+ if (w == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ unicode_copy_as_widechar(unicode, w, wlen + 1);
+ _PyUnicode_WSTR(unicode) = w;
+ if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
+ _PyUnicode_WSTR_LENGTH(unicode) = wlen;
}
}
if (size != NULL)
*size = PyUnicode_WSTR_LENGTH(unicode);
- return _PyUnicode_WSTR(unicode);
+ return w;
}
Py_UNICODE *