From 4123226bbda437b64b60c9111d3e4cc9dc76f7d5 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 21 Jun 2024 19:33:15 +0200 Subject: gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639) Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. Co-authored-by: Serhiy Storchaka --- Doc/c-api/unicode.rst | 35 ++++++- Doc/whatsnew/3.14.rst | 2 + Include/cpython/unicodeobject.h | 10 ++ Modules/_testcapi/unicode.c | 152 ++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 201 +++++++++++++++++++++++++++------------- 5 files changed, 332 insertions(+), 68 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 02e696c..4ea20bd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1551,9 +1551,17 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. - To use a different error handler than ``strict``, - :c:func:`PyUnicode_DecodeUTF8` can be used with - :c:func:`PyUnicodeWriter_WriteStr`. + See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. + +.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size) + + Writer the wide string *str* into *writer*. + + *size* is a number of wide characters. If *size* is equal to ``-1``, call + ``wcslen(str)`` to get the string length. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) @@ -1586,3 +1594,24 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. + +.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed) + + Decode the string *str* from UTF-8 with *errors* error handler and write the + output into *writer*. + + *size* is the string length in bytes. If *size* is equal to ``-1``, call + ``strlen(str)`` to get the string length. + + *errors* is an error handler name, such as ``"replace"``. If *errors* is + ``NULL``, use the strict error handler. + + If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded + bytes on success. + If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences + as an error. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + + See also :c:func:`PyUnicodeWriter_WriteUTF8`. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 804d39a..2eefa23 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -291,10 +291,12 @@ New Features * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. * :c:func:`PyUnicodeWriter_WriteSubstring`. * :c:func:`PyUnicodeWriter_Format`. + * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. (Contributed by Victor Stinner in :gh:`119182`.) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index e5e1b6b..059bec8 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( PyUnicodeWriter *writer, const char *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( + PyUnicodeWriter *writer, + const wchar_t *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, @@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format( PyUnicodeWriter *writer, const char *format, ...); +PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful( + PyUnicodeWriter *writer, + const char *string, /* UTF-8 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed); /* bytes consumed */ /* --- Private _PyUnicodeWriter API --------------------------------------- */ diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 79f99c4..da658b4 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -375,6 +375,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args)) static PyObject * +test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test PyUnicodeWriter_DecodeUTF8Stateful() + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // incomplete trailing UTF-8 sequence + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, + "ignore-replace\xef\xbf\xbd" + "-incomplete\xef\xbf\xbd")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test PyUnicodeWriter_DecodeUTF8Stateful() + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + Py_ssize_t consumed; + + // valid string + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) { + goto error; + } + assert(consumed == 4); + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // non-ASCII + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) { + goto error; + } + assert(consumed == 6); + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // consumed is 0 if write fails + consumed = 12345; + assert(PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed) < 0); + PyErr_Clear(); + assert(consumed == 0); + + // ignore error handler + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) { + goto error; + } + assert(consumed == 5); + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // incomplete trailing UTF-8 sequence + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) { + goto error; + } + assert(consumed == 10); + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, + "text-\xC3\xA9-\xE2\x82\xAC-" + "more-incomplete")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) { PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); @@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args } +static PyObject * +test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '.') < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, + "latin1=\xC3\xA9-euro=\xE2\x82\xAC.")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + static PyMethodDef TestMethods[] = { {"unicode_new", unicode_new, METH_VARARGS}, {"unicode_fill", unicode_fill, METH_VARARGS}, @@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = { {"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS}, {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS}, {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS}, + {"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS}, + {"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS}, {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS}, {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS}, + {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bca2ab1..74a7438 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1374,46 +1374,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) return obj; } -#if SIZEOF_WCHAR_T == 2 -/* Helper function to convert a 16-bits wchar_t representation to UCS4, this - will decode surrogate pairs, the other conversions are implemented as macros - for efficiency. - - This function assumes that unicode can hold one more code point than wstr - characters for a terminating null character. */ -static void -unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, - PyObject *unicode) -{ - const wchar_t *iter; - Py_UCS4 *ucs4_out; - - assert(unicode != NULL); - assert(_PyUnicode_CHECK(unicode)); - assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); - ucs4_out = PyUnicode_4BYTE_DATA(unicode); - - for (iter = begin; iter < end; ) { - assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); - if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) - && (iter+1) < end - && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) - { - *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); - iter += 2; - } - else { - *ucs4_out++ = *iter; - iter++; - } - } - assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); - -} -#endif - static int unicode_check_modifiable(PyObject *unicode) { @@ -1937,6 +1897,62 @@ unicode_char(Py_UCS4 ch) return unicode; } + +static inline void +unicode_write_widechar(int kind, void *data, + const wchar_t *u, Py_ssize_t size, + Py_ssize_t num_surrogates) +{ + switch (kind) { + case PyUnicode_1BYTE_KIND: + _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data); + break; + + case PyUnicode_2BYTE_KIND: +#if SIZEOF_WCHAR_T == 2 + memcpy(data, u, size * 2); +#else + _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data); +#endif + break; + + case PyUnicode_4BYTE_KIND: + { +#if SIZEOF_WCHAR_T == 2 + // Convert a 16-bits wchar_t representation to UCS4, this will decode + // surrogate pairs. + const wchar_t *end = u + size; + Py_UCS4 *ucs4_out = (Py_UCS4*)data; +# ifndef NDEBUG + Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates); +# endif + for (const wchar_t *iter = u; iter < end; ) { + assert(ucs4_out < ucs4_end); + if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) + && (iter+1) < end + && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) + { + *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); + iter += 2; + } + else { + *ucs4_out++ = *iter; + iter++; + } + } + assert(ucs4_out == ucs4_end); +#else + assert(num_surrogates == 0); + memcpy(data, u, size * 4); +#endif + break; + } + default: + Py_UNREACHABLE(); + } +} + + PyObject * PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) { @@ -1989,36 +2005,65 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) if (!unicode) return NULL; - switch (PyUnicode_KIND(unicode)) { - case PyUnicode_1BYTE_KIND: - _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, - u, u + size, PyUnicode_1BYTE_DATA(unicode)); - break; - case PyUnicode_2BYTE_KIND: -#if Py_UNICODE_SIZE == 2 - memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); -#else - _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, - u, u + size, PyUnicode_2BYTE_DATA(unicode)); -#endif - break; - case PyUnicode_4BYTE_KIND: -#if SIZEOF_WCHAR_T == 2 - /* This is the only case which has to process surrogates, thus - a simple copy loop is not enough and we need a function. */ - unicode_convert_wchar_to_ucs4(u, u + size, unicode); -#else - assert(num_surrogates == 0); - memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); + unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), + u, size, num_surrogates); + + return unicode_result(unicode); +} + + +int +PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, + const wchar_t *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + if (size < 0) { + size = wcslen(str); + } + + if (size == 0) { + return 0; + } + +#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION + /* Oracle Solaris uses non-Unicode internal wchar_t form for + non-Unicode locales and hence needs conversion to UCS-4 first. */ + if (_Py_LocaleUsesNonUnicodeWchar()) { + wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size); + if (!converted) { + return -1; + } + PyObject *unicode = _PyUnicode_FromUCS4(converted, size); + PyMem_Free(converted); + + int res = _PyUnicodeWriter_WriteStr(writer, unicode); + Py_DECREF(unicode); + return res; + } #endif - break; - default: - Py_UNREACHABLE(); + + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; + if (find_maxchar_surrogates(str, str + size, + &maxchar, &num_surrogates) == -1) { + return -1; } - return unicode_result(unicode); + if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) { + return -1; + } + + int kind = writer->kind; + void *data = (Py_UCS1*)writer->data + writer->pos * kind; + unicode_write_widechar(kind, data, str, size, num_surrogates); + + writer->pos += size - num_surrogates; + return 0; } + PyObject * PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) { @@ -13649,6 +13694,32 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, return res; } + +int +PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, + const char *string, + Py_ssize_t length, + const char *errors, + Py_ssize_t *consumed) +{ + if (length < 0) { + length = strlen(string); + } + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t old_pos = _writer->pos; + int res = unicode_decode_utf8_writer(_writer, string, length, + _Py_ERROR_UNKNOWN, errors, consumed); + if (res < 0) { + _writer->pos = old_pos; + if (consumed) { + *consumed = 0; + } + } + return res; +} + + int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) -- cgit v0.12