summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <vstinner@python.org>2024-06-21 17:33:15 (GMT)
committerGitHub <noreply@github.com>2024-06-21 17:33:15 (GMT)
commit4123226bbda437b64b60c9111d3e4cc9dc76f7d5 (patch)
tree6bb6ff8c51815b10afceb455a10912ed43ca38c5
parentaed31beca9a54b85a1392631a48da80602210f18 (diff)
downloadcpython-4123226bbda437b64b60c9111d3e4cc9dc76f7d5.zip
cpython-4123226bbda437b64b60c9111d3e4cc9dc76f7d5.tar.gz
cpython-4123226bbda437b64b60c9111d3e4cc9dc76f7d5.tar.bz2
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639)
Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
-rw-r--r--Doc/c-api/unicode.rst35
-rw-r--r--Doc/whatsnew/3.14.rst2
-rw-r--r--Include/cpython/unicodeobject.h10
-rw-r--r--Modules/_testcapi/unicode.c152
-rw-r--r--Objects/unicodeobject.c201
5 files changed, 332 insertions, 68 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 02e696c..4ea20bd 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1551,9 +1551,17 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
- To use a different error handler than ``strict``,
- :c:func:`PyUnicode_DecodeUTF8` can be used with
- :c:func:`PyUnicodeWriter_WriteStr`.
+ See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
+
+.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
+
+ Write the wide string *str* into *writer*.
+
+ *size* is the number of wide characters. If *size* is equal to ``-1``, call
+ ``wcslen(str)`` to get the string length.
+
+ On success, return ``0``.
+ On error, set an exception, leave the writer unchanged, and return ``-1``.
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
@@ -1586,3 +1594,24 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
+
+.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
+
+ Decode the string *string* from UTF-8 with *errors* error handler and write the
+ output into *writer*.
+
+ *length* is the string length in bytes. If *length* is equal to ``-1``, call
+ ``strlen(string)`` to get the string length.
+
+ *errors* is an error handler name, such as ``"replace"``. If *errors* is
+ ``NULL``, use the strict error handler.
+
+ If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
+ bytes on success.
+ If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
+ as an error.
+
+ On success, return ``0``.
+ On error, set an exception, leave the writer unchanged, and return ``-1``.
+
+ See also :c:func:`PyUnicodeWriter_WriteUTF8`.
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
index 804d39a..2eefa23 100644
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -291,10 +291,12 @@ New Features
* :c:func:`PyUnicodeWriter_Finish`.
* :c:func:`PyUnicodeWriter_WriteChar`.
* :c:func:`PyUnicodeWriter_WriteUTF8`.
+ * :c:func:`PyUnicodeWriter_WriteWideChar`.
* :c:func:`PyUnicodeWriter_WriteStr`.
* :c:func:`PyUnicodeWriter_WriteRepr`.
* :c:func:`PyUnicodeWriter_WriteSubstring`.
* :c:func:`PyUnicodeWriter_Format`.
+ * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
(Contributed by Victor Stinner in :gh:`119182`.)
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index e5e1b6b..059bec8 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
PyUnicodeWriter *writer,
const char *str,
Py_ssize_t size);
+PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
+ PyUnicodeWriter *writer,
+ const wchar_t *str,
+ Py_ssize_t size);
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
PyUnicodeWriter *writer,
@@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
PyUnicodeWriter *writer,
const char *format,
...);
+PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
+ PyUnicodeWriter *writer,
+ const char *string, /* UTF-8 encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed); /* bytes consumed */
/* --- Private _PyUnicodeWriter API --------------------------------------- */
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index 79f99c4..da658b4 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -375,6 +375,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
static PyObject *
+test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
+{
+ // test PyUnicodeWriter_DecodeUTF8Stateful()
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL) {
+ return NULL;
+ }
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+ goto error;
+ }
+
+ // incomplete trailing UTF-8 sequence
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0) {
+ goto error;
+ }
+
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL) {
+ return NULL;
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "ignore-replace\xef\xbf\xbd"
+ "-incomplete\xef\xbf\xbd"));
+ Py_DECREF(result);
+
+ Py_RETURN_NONE;
+
+error:
+ PyUnicodeWriter_Discard(writer);
+ return NULL;
+}
+
+
+static PyObject *
+test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
+{
+ // test PyUnicodeWriter_DecodeUTF8Stateful()
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL) {
+ return NULL;
+ }
+ Py_ssize_t consumed;
+
+ // valid string
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
+ goto error;
+ }
+ assert(consumed == 4);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+ goto error;
+ }
+
+ // non-ASCII
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
+ goto error;
+ }
+ assert(consumed == 6);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+ goto error;
+ }
+
+ // consumed is 0 if write fails
+ consumed = 12345;
+ assert(PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed) < 0);
+ PyErr_Clear();
+ assert(consumed == 0);
+
+ // ignore error handler
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
+ goto error;
+ }
+ assert(consumed == 5);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+ goto error;
+ }
+
+ // incomplete trailing UTF-8 sequence
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
+ goto error;
+ }
+ assert(consumed == 10);
+
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL) {
+ return NULL;
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "text-\xC3\xA9-\xE2\x82\xAC-"
+ "more-incomplete"));
+ Py_DECREF(result);
+
+ Py_RETURN_NONE;
+
+error:
+ PyUnicodeWriter_Discard(writer);
+ return NULL;
+}
+
+
+static PyObject *
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
{
PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
@@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
}
+static PyObject *
+test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
+{
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL) {
+ return NULL;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0) {
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '.') < 0) {
+ goto error;
+ }
+
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL) {
+ return NULL;
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
+ Py_DECREF(result);
+
+ Py_RETURN_NONE;
+
+error:
+ PyUnicodeWriter_Discard(writer);
+ return NULL;
+}
+
+
static PyMethodDef TestMethods[] = {
{"unicode_new", unicode_new, METH_VARARGS},
{"unicode_fill", unicode_fill, METH_VARARGS},
@@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = {
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
+ {"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
+ {"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
+ {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
{NULL},
};
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index bca2ab1..74a7438 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1374,46 +1374,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
return obj;
}
-#if SIZEOF_WCHAR_T == 2
-/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
- will decode surrogate pairs, the other conversions are implemented as macros
- for efficiency.
-
- This function assumes that unicode can hold one more code point than wstr
- characters for a terminating null character. */
-static void
-unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
- PyObject *unicode)
-{
- const wchar_t *iter;
- Py_UCS4 *ucs4_out;
-
- assert(unicode != NULL);
- assert(_PyUnicode_CHECK(unicode));
- assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
- ucs4_out = PyUnicode_4BYTE_DATA(unicode);
-
- for (iter = begin; iter < end; ) {
- assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
- _PyUnicode_GET_LENGTH(unicode)));
- if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
- && (iter+1) < end
- && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
- {
- *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
- iter += 2;
- }
- else {
- *ucs4_out++ = *iter;
- iter++;
- }
- }
- assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
- _PyUnicode_GET_LENGTH(unicode)));
-
-}
-#endif
-
static int
unicode_check_modifiable(PyObject *unicode)
{
@@ -1937,6 +1897,62 @@ unicode_char(Py_UCS4 ch)
return unicode;
}
+
+static inline void
+unicode_write_widechar(int kind, void *data,
+ const wchar_t *u, Py_ssize_t size,
+ Py_ssize_t num_surrogates)
+{
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND:
+ _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
+ break;
+
+ case PyUnicode_2BYTE_KIND:
+#if SIZEOF_WCHAR_T == 2
+ memcpy(data, u, size * 2);
+#else
+ _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
+#endif
+ break;
+
+ case PyUnicode_4BYTE_KIND:
+ {
+#if SIZEOF_WCHAR_T == 2
+ // Convert a 16-bits wchar_t representation to UCS4, this will decode
+ // surrogate pairs.
+ const wchar_t *end = u + size;
+ Py_UCS4 *ucs4_out = (Py_UCS4*)data;
+# ifndef NDEBUG
+ Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
+# endif
+ for (const wchar_t *iter = u; iter < end; ) {
+ assert(ucs4_out < ucs4_end);
+ if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
+ && (iter+1) < end
+ && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
+ {
+ *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
+ iter += 2;
+ }
+ else {
+ *ucs4_out++ = *iter;
+ iter++;
+ }
+ }
+ assert(ucs4_out == ucs4_end);
+#else
+ assert(num_surrogates == 0);
+ memcpy(data, u, size * 4);
+#endif
+ break;
+ }
+ default:
+ Py_UNREACHABLE();
+ }
+}
+
+
PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
@@ -1989,36 +2005,65 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (!unicode)
return NULL;
- switch (PyUnicode_KIND(unicode)) {
- case PyUnicode_1BYTE_KIND:
- _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
- u, u + size, PyUnicode_1BYTE_DATA(unicode));
- break;
- case PyUnicode_2BYTE_KIND:
-#if Py_UNICODE_SIZE == 2
- memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
-#else
- _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
- u, u + size, PyUnicode_2BYTE_DATA(unicode));
-#endif
- break;
- case PyUnicode_4BYTE_KIND:
-#if SIZEOF_WCHAR_T == 2
- /* This is the only case which has to process surrogates, thus
- a simple copy loop is not enough and we need a function. */
- unicode_convert_wchar_to_ucs4(u, u + size, unicode);
-#else
- assert(num_surrogates == 0);
- memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
+ unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
+ u, size, num_surrogates);
+
+ return unicode_result(unicode);
+}
+
+
+int
+PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
+ const wchar_t *str,
+ Py_ssize_t size)
+{
+ _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
+
+ if (size < 0) {
+ size = wcslen(str);
+ }
+
+ if (size == 0) {
+ return 0;
+ }
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+ /* Oracle Solaris uses non-Unicode internal wchar_t form for
+ non-Unicode locales and hence needs conversion to UCS-4 first. */
+ if (_Py_LocaleUsesNonUnicodeWchar()) {
+ wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
+ if (!converted) {
+ return -1;
+ }
+ PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+ PyMem_Free(converted);
+
+ int res = _PyUnicodeWriter_WriteStr(writer, unicode);
+ Py_DECREF(unicode);
+ return res;
+ }
#endif
- break;
- default:
- Py_UNREACHABLE();
+
+ Py_UCS4 maxchar = 0;
+ Py_ssize_t num_surrogates;
+ if (find_maxchar_surrogates(str, str + size,
+ &maxchar, &num_surrogates) == -1) {
+ return -1;
}
- return unicode_result(unicode);
+ if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
+ return -1;
+ }
+
+ int kind = writer->kind;
+ void *data = (Py_UCS1*)writer->data + writer->pos * kind;
+ unicode_write_widechar(kind, data, str, size, num_surrogates);
+
+ writer->pos += size - num_surrogates;
+ return 0;
}
+
PyObject *
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
@@ -13649,6 +13694,32 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
return res;
}
+
+int
+PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
+ const char *string,
+ Py_ssize_t length,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ if (length < 0) {
+ length = strlen(string);
+ }
+
+ _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
+ Py_ssize_t old_pos = _writer->pos;
+ int res = unicode_decode_utf8_writer(_writer, string, length,
+ _Py_ERROR_UNKNOWN, errors, consumed);
+ if (res < 0) {
+ _writer->pos = old_pos;
+ if (consumed) {
+ *consumed = 0;
+ }
+ }
+ return res;
+}
+
+
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
const char *str, Py_ssize_t len)