bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)

Co-authored-by: Victor Stinner <vstinner@python.org>
author: Inada Naoki <songofacandy@gmail.com> 2020-03-14 03:43:18 (GMT)
committer: GitHub <noreply@github.com> 2020-03-14 03:43:18 (GMT)
commit: c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b (patch)
tree: 2026fd46b762fb2deaf9091e4d7e09dc198bc2d3 /Modules/_testcapimodule.c
parent: 8fb02b6e1942811c8d81041e7df3f5f1f4b1d410 (diff)
download: cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.zip
cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.tar.gz
cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.tar.bz2
1 files changed, 212 insertions, 0 deletions
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 3cc5586..09b7706 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1968,6 +1968,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
 }
 
 static PyObject *
+unicode_getutf8buffer(PyObject *self, PyObject *args)
+{
+    /* Python-callable wrapper for _PyUnicode_GetUTF8Buffer(): accepts
+       (obj[, errors]) and returns a bytes copy of the exported buffer. */
+    PyObject *obj;
+    const char *errors = NULL;
+    Py_buffer view;
+
+    if (!PyArg_ParseTuple(args, "O|s", &obj, &errors)
+        || _PyUnicode_GetUTF8Buffer(obj, errors, &view) < 0) {
+        return NULL;
+    }
+    /* The exporter is either the argument itself or an exact bytes object. */
+    assert(view.obj != NULL);
+    assert(view.obj == obj || PyBytes_CheckExact(view.obj));
+
+    PyObject *copy = PyBytes_FromStringAndSize(view.buf, view.len);
+    PyBuffer_Release(&view);
+    return copy;
+}
+
+static PyObject *
+unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    /* Self-test for _PyUnicode_GetUTF8Buffer(); takes no arguments.
+     * Returns None if every check passes, else NULL with an exception set.
+     *
+     * Error paths jump to "release_fail" (release buf, then drop str) or
+     * to "fail" (drop str only).  Refcount mismatches in tests 2 and 3
+     * return directly and deliberately leave both untouched. */
+    Py_buffer buf;
+    PyObject *str;
+    Py_ssize_t refcnt;
+    const char *cache;
+
+    // Test 1: ASCII string.  buf must borrow str's own 1-byte data.
+    str = PyUnicode_FromString("hello");
+    if (str == NULL) {
+        return NULL;
+    }
+    refcnt = Py_REFCNT(str);
+
+    // Must not fail for ASCII string; checked at run time, not via assert().
+    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) != 0) {
+        if (!PyErr_Occurred()) {
+            PyErr_Format(TestError,
+                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
+                         "without exception set. (%s:%d)",
+                         __FILE__, __LINE__);
+        }
+        goto fail;
+    }
+
+    if (buf.obj != str) {
+        PyErr_Format(TestError,
+                     "buf.obj must be equal to str. (%s:%d)",
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    if (buf.len != PyUnicode_GET_LENGTH(str)) {
+        PyErr_Format(TestError,
+                     "buf.len must be equal to len(str). (%s:%d)",
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+    assert(((const char*)buf.buf)[5] == '\0');
+
+    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
+        PyErr_Format(TestError,
+                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    if (refcnt + 1 != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt + 1, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    PyBuffer_Release(&buf);
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        goto fail;
+    }
+
+    Py_DECREF(str);
+
+    // Test 2: non-ASCII string.  buf.obj must be an exact bytes object.
+    // "hello" in Japanese.  len(str)==5, len(str.encode()) == 15.
+    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
+    if (str == NULL) {
+        return NULL;
+    }
+    refcnt = Py_REFCNT(str);
+    assert(PyUnicode_GET_LENGTH(str) == 5);
+
+    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
+        if (!PyErr_Occurred()) {
+            PyErr_Format(TestError,
+                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
+                         "without exception set. (%s:%d)",
+                         __FILE__, __LINE__);
+        }
+        goto fail;
+    }
+
+    if (!PyBytes_CheckExact(buf.obj)) {
+        PyErr_Format(TestError,
+                     "buf.obj must be a bytes object, got %R (%s:%d)",
+                     buf.obj, __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    if (buf.len != 15) {
+        PyErr_Format(TestError,
+                     "Expected buf.len == 15, actual %zd (%s:%d)",
+                     buf.len, __FILE__, __LINE__);
+        goto release_fail;
+    }
+    assert(((const char*)buf.buf)[15] == '\0');
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str) must not be changed. (%s:%d)",
+                     __FILE__, __LINE__);
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    PyBuffer_Release(&buf);
+
+    // Test 3: There is a UTF-8 cache
+    // Reuse str of the previous test.
+    cache = PyUnicode_AsUTF8(str);
+    if (cache == NULL) {
+        // str is still owned here; drop it instead of leaking it.
+        goto fail;
+    }
+
+    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
+        if (!PyErr_Occurred()) {
+            PyErr_Format(TestError,
+                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
+                         "without exception set. (%s:%d)",
+                         __FILE__, __LINE__);
+        }
+        goto fail;
+    }
+
+    if (buf.obj != str) {
+        PyErr_Format(TestError,
+                     "buf.obj must be equal to str. (%s:%d)",
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    if (buf.buf != cache) {
+        PyErr_Format(TestError,
+                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
+                     __FILE__, __LINE__);
+        goto release_fail;
+    }
+
+    if (buf.len != 15) {
+        PyErr_Format(TestError,
+                     "Expected buf.len == 15, actual %zd (%s:%d)",
+                     buf.len, __FILE__, __LINE__);
+        goto release_fail;
+    }
+    assert(((const char*)buf.buf)[15] == '\0');
+
+    if (refcnt + 1 != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt + 1, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    PyBuffer_Release(&buf);
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    Py_DECREF(str);
+    Py_RETURN_NONE;
+
+release_fail:
+    PyBuffer_Release(&buf);
+fail:
+    Py_DECREF(str);
+    return NULL;
+}
+
+static PyObject *
 unicode_findchar(PyObject *self, PyObject *args)
 {
     PyObject *str;
@@ -5392,6 +5602,8 @@ static PyMethodDef TestMethods[] = {
     {"unicode_asucs4",          unicode_asucs4,                  METH_VARARGS},
     {"unicode_asutf8",          unicode_asutf8,                  METH_VARARGS},
     {"unicode_asutf8andsize",   unicode_asutf8andsize,           METH_VARARGS},
+    {"unicode_getutf8buffer",   unicode_getutf8buffer,           METH_VARARGS},
+    {"unicode_test_getutf8buffer", unicode_test_getutf8buffer,   METH_NOARGS},
     {"unicode_findchar",        unicode_findchar,                METH_VARARGS},
     {"unicode_copycharacters",  unicode_copycharacters,          METH_VARARGS},
     {"unicode_encodedecimal",   unicode_encodedecimal,           METH_VARARGS},
author	Inada Naoki <songofacandy@gmail.com>	2020-03-14 03:43:18 (GMT)
committer	GitHub <noreply@github.com>	2020-03-14 03:43:18 (GMT)
commit	c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b (patch)
tree	2026fd46b762fb2deaf9091e4d7e09dc198bc2d3 /Modules/_testcapimodule.c
parent	8fb02b6e1942811c8d81041e7df3f5f1f4b1d410 (diff)
download	cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.zip cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.tar.gz cpython-c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.tar.bz2