summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Victor Stinner <vstinner@python.org> 2023-10-20 15:59:29 (GMT)
committer GitHub <noreply@github.com> 2023-10-20 15:59:29 (GMT)
commit d731579bfb9a497cfb0076cb6b221058a20088fe (patch)
tree 63eb8f9b8dfb7af9a5de6d4499ab375b80f208d4
parent 59ea0f523e155ac1a471cd292b41a76241fccd36 (diff)
download cpython-d731579bfb9a497cfb0076cb6b221058a20088fe.zip
cpython-d731579bfb9a497cfb0076cb6b221058a20088fe.tar.gz
cpython-d731579bfb9a497cfb0076cb6b221058a20088fe.tar.bz2
gh-111089: PyUnicode_AsUTF8() now raises on embedded NUL (#111091)
* PyUnicode_AsUTF8() now raises an exception if the string contains
  embedded null characters.
* Update related C API tests (test_capi.test_unicode).
* type_new_set_doc() uses PyUnicode_AsUTF8AndSize() to silently truncate
  doc containing null bytes.

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
-rw-r--r-- Doc/c-api/unicode.rst 8
-rw-r--r-- Doc/whatsnew/3.13.rst 6
-rw-r--r-- Include/cpython/unicodeobject.h 20
-rw-r--r-- Include/unicodeobject.h 20
-rw-r--r-- Lib/test/test_capi/test_unicode.py 5
-rw-r--r-- Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst 2
-rw-r--r-- Objects/typeobject.c 5
-rw-r--r-- Objects/unicodeobject.c 8
8 files changed, 49 insertions, 25 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 5ab9f1c..d17e63d 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -992,11 +992,19 @@ These are the UTF-8 codec APIs:
As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size.
+ Raise an exception if the *unicode* string contains embedded null
+ characters. To accept embedded null characters and truncate on purpose
+ at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be
+ used instead.
+
.. versionadded:: 3.3
.. versionchanged:: 3.7
The return type is now ``const char *`` rather than ``char *``.
+ .. versionchanged:: 3.13
+ Raise an exception if the string contains embedded null characters.
+
UTF-32 Codecs
"""""""""""""
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index 804c7a8..34e4d67 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -1109,6 +1109,12 @@ Porting to Python 3.13
are now undefined by ``<Python.h>``.
(Contributed by Victor Stinner in :gh:`85283`.)
+* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string
+ contains embedded null characters. To accept embedded null characters and
+ truncate on purpose at the first null byte,
+ ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead.
+ (Contributed by Victor Stinner in :gh:`111089`.)
+
Deprecated
----------
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 859ab71..d67553c 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -442,18 +442,18 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
/* --- Manage the default encoding ---------------------------------------- */
-/* Returns a pointer to the default encoding (UTF-8) of the
- Unicode object unicode.
-
- Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
- in the unicodeobject.
-
- Use of this API is DEPRECATED since no size information can be
- extracted from the returned data.
-*/
-
+// Returns a pointer to the default encoding (UTF-8) of the
+// Unicode object unicode.
+//
+// Raise an exception if the string contains embedded null characters.
+// Use PyUnicode_AsUTF8AndSize() to accept embedded null characters.
+//
+// This function caches the UTF-8 encoded string in the Unicode object
+// and subsequent calls will return the same string. The memory is released
+// when the Unicode object is deallocated.
PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
+
/* === Characters Type APIs =============================================== */
/* These should not be used directly. Use the Py_UNICODE_IS* and
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index dee0071..1e5753d 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -443,17 +443,15 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
PyObject *unicode /* Unicode object */
);
-/* Returns a pointer to the default encoding (UTF-8) of the
- Unicode object unicode and the size of the encoded representation
- in bytes stored in *size.
-
- In case of an error, no *size is set.
-
- This function caches the UTF-8 encoded string in the unicodeobject
- and subsequent calls will return the same string. The memory is released
- when the unicodeobject is deallocated.
-*/
-
+// Returns a pointer to the default encoding (UTF-8) of the
+// Unicode object unicode and the size of the encoded representation
+// in bytes stored in `*size` (if size is not NULL).
+//
+// On error, `*size` is set to 0 (if size is not NULL).
+//
+// This function caches the UTF-8 encoded string in the Unicode object
+// and subsequent calls will return the same string. The memory is released
+// when the Unicode object is deallocated.
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
PyObject *unicode,
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index 3ec27a2..8ab5590 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -882,7 +882,10 @@ class CAPITest(unittest.TestCase):
self.assertEqual(unicode_asutf8('abc', 4), b'abc\0')
self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0')
self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0')
- self.assertEqual(unicode_asutf8('abc\0def', 8), b'abc\0def\0')
+
+ # disallow embedded null characters
+ self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0)
+ self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0)
self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0)
self.assertRaises(TypeError, unicode_asutf8, b'abc', 0)
diff --git a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst
new file mode 100644
index 0000000..2008dd5
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst
@@ -0,0 +1,2 @@
+The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the
+string contains embedded null characters. Patch by Victor Stinner.
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 3261a14..2508569 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -3499,13 +3499,14 @@ type_new_set_doc(PyTypeObject *type)
return 0;
}
- const char *doc_str = PyUnicode_AsUTF8(doc);
+ Py_ssize_t doc_size;
+ const char *doc_str = PyUnicode_AsUTF8AndSize(doc, &doc_size);
if (doc_str == NULL) {
return -1;
}
// Silently truncate the docstring if it contains a null byte
- Py_ssize_t size = strlen(doc_str) + 1;
+ Py_ssize_t size = doc_size + 1;
char *tp_doc = (char *)PyObject_Malloc(size);
if (tp_doc == NULL) {
PyErr_NoMemory();
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 33cbc98..07d1b6e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3837,7 +3837,13 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
const char *
PyUnicode_AsUTF8(PyObject *unicode)
{
- return PyUnicode_AsUTF8AndSize(unicode, NULL);
+ Py_ssize_t size;
+ const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
+ if (utf8 != NULL && strlen(utf8) != (size_t)size) {
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
+ return NULL;
+ }
+ return utf8;
}
/*