From d731579bfb9a497cfb0076cb6b221058a20088fe Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 20 Oct 2023 17:59:29 +0200 Subject: gh-111089: PyUnicode_AsUTF8() now raises on embedded NUL (#111091) * PyUnicode_AsUTF8() now raises an exception if the string contains embedded null characters. * Update related C API tests (test_capi.test_unicode). * type_new_set_doc() uses PyUnicode_AsUTF8AndSize() to silently truncate doc containing null bytes. Co-authored-by: Serhiy Storchaka --- Doc/c-api/unicode.rst | 8 ++++++++ Doc/whatsnew/3.13.rst | 6 ++++++ Include/cpython/unicodeobject.h | 20 ++++++++++---------- Include/unicodeobject.h | 20 +++++++++----------- Lib/test/test_capi/test_unicode.py | 5 ++++- .../2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst | 2 ++ Objects/typeobject.c | 5 +++-- Objects/unicodeobject.c | 8 +++++++- 8 files changed, 49 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 5ab9f1c..d17e63d 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -992,11 +992,19 @@ These are the UTF-8 codec APIs: As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size. + Raise an exception if the *unicode* string contains embedded null + characters. To accept embedded null characters and truncate on purpose + at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be + used instead. + .. versionadded:: 3.3 .. versionchanged:: 3.7 The return type is now ``const char *`` rather of ``char *``. + .. versionchanged:: 3.13 + Raise an exception if the string contains embedded null characters. + UTF-32 Codecs """"""""""""" diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 804c7a8..34e4d67 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1109,6 +1109,12 @@ Porting to Python 3.13 are now undefined by ``<Python.h>``. (Contributed by Victor Stinner in :gh:`85283`.) 
+* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string + contains embedded null characters. To accept embedded null characters and + truncate on purpose at the first null byte, + ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead. + (Contributed by Victor Stinner in :gh:`111089`.) + Deprecated ---------- diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 859ab71..d67553c 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -442,18 +442,18 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( /* --- Manage the default encoding ---------------------------------------- */ -/* Returns a pointer to the default encoding (UTF-8) of the - Unicode object unicode. - - Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation - in the unicodeobject. - - Use of this API is DEPRECATED since no size information can be - extracted from the returned data. -*/ - +// Returns a pointer to the default encoding (UTF-8) of the +// Unicode object unicode. +// +// Raise an exception if the string contains embedded null characters. +// Use PyUnicode_AsUTF8AndSize() to accept embedded null characters. +// +// This function caches the UTF-8 encoded string in the Unicode object +// and subsequent calls will return the same string. The memory is released +// when the Unicode object is deallocated. PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); + /* === Characters Type APIs =============================================== */ /* These should not be used directly. 
Use the Py_UNICODE_IS* and diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee0071..1e5753d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -443,17 +443,15 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( PyObject *unicode /* Unicode object */ ); -/* Returns a pointer to the default encoding (UTF-8) of the - Unicode object unicode and the size of the encoded representation - in bytes stored in *size. - - In case of an error, no *size is set. - - This function caches the UTF-8 encoded string in the unicodeobject - and subsequent calls will return the same string. The memory is released - when the unicodeobject is deallocated. -*/ - +// Returns a pointer to the default encoding (UTF-8) of the +// Unicode object unicode and the size of the encoded representation +// in bytes stored in `*size` (if size is not NULL). +// +// On error, `*size` is set to 0 (if size is not NULL). +// +// This function caches the UTF-8 encoded string in the Unicode object +// and subsequent calls will return the same string. The memory is released +// when the Unicode object is deallocated. 
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( PyObject *unicode, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3ec27a2..8ab5590 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -882,7 +882,10 @@ class CAPITest(unittest.TestCase): self.assertEqual(unicode_asutf8('abc', 4), b'abc\0') self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0') self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0') - self.assertEqual(unicode_asutf8('abc\0def', 8), b'abc\0def\0') + + # disallow embedded null characters + self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0) + self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0) self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0) self.assertRaises(TypeError, unicode_asutf8, b'abc', 0) diff --git a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst new file mode 100644 index 0000000..2008dd5 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst @@ -0,0 +1,2 @@ +The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the +string contains embedded null characters. Patch by Victor Stinner. 
diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 3261a14..2508569 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -3499,13 +3499,14 @@ type_new_set_doc(PyTypeObject *type) return 0; } - const char *doc_str = PyUnicode_AsUTF8(doc); + Py_ssize_t doc_size; + const char *doc_str = PyUnicode_AsUTF8AndSize(doc, &doc_size); if (doc_str == NULL) { return -1; } // Silently truncate the docstring if it contains a null byte - Py_ssize_t size = strlen(doc_str) + 1; + Py_ssize_t size = doc_size + 1; char *tp_doc = (char *)PyObject_Malloc(size); if (tp_doc == NULL) { PyErr_NoMemory(); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 33cbc98..07d1b6e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3837,7 +3837,13 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) const char * PyUnicode_AsUTF8(PyObject *unicode) { - return PyUnicode_AsUTF8AndSize(unicode, NULL); + Py_ssize_t size; + const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size); + if (utf8 != NULL && strlen(utf8) != (size_t)size) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + return NULL; + } + return utf8; } /* -- cgit v0.12