diff options
-rw-r--r-- | Doc/c-api/unicode.rst | 9 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 45 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Modules/_testcapimodule.c | 34 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 10 |
5 files changed, 94 insertions, 7 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 261a43c..3f6a604 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -578,13 +578,16 @@ APIs: .. versionadded:: 3.3 -.. c:function:: int PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, \ - PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) +.. c:function:: Py_ssize_t PyUnicode_CopyCharacters(PyObject *to, \ + Py_ssize_t to_start, \ + PyObject *from, \ + Py_ssize_t from_start, \ + Py_ssize_t how_many) Copy characters from one Unicode object into another. This function performs character conversion when necessary and falls back to :c:func:`memcpy` if possible. Returns ``-1`` and sets an exception on error, otherwise returns - ``0``. + the number of copied characters. .. versionadded:: 3.3 diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 6086bb8..81e49d6 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -4,7 +4,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import _string import codecs import itertools @@ -2704,6 +2704,49 @@ class CAPITest(unittest.TestCase): self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0') self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff') + # Test PyUnicode_CopyCharacters() + @support.cpython_only + def test_copycharacters(self): + from _testcapi import unicode_copycharacters + + strings = [ + 'abcde', '\xa1\xa2\xa3\xa4\xa5', + '\u4f60\u597d\u4e16\u754c\uff01', + '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' + ] + + for idx, from_ in enumerate(strings): + # wide -> narrow: exceed maxchar limitation + for to in strings[:idx]: + self.assertRaises( + SystemError, + unicode_copycharacters, to, 0, from_, 0, 5 + ) + # same kind + for from_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, 0, from_, from_start, 5), + (from_[from_start:from_start+5].ljust(5, '\0'), + 5-from_start) + ) + for to_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, to_start, from_, to_start, 5), + (from_[to_start:to_start+5].rjust(5, '\0'), + 5-to_start) + ) + # narrow -> wide + # Tests omitted since this creates invalid strings. + + s = strings[0] + self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) + @support.cpython_only def test_encode_decimal(self): from _testcapi import unicode_encodedecimal @@ -10,6 +10,9 @@ Release date: TBA Core and Builtins ----------------- +- Issue #28379: Added sanity checks and tests for PyUnicode_CopyCharacters(). + Patch by Xiang Zhang. + - Issue #28376: The type of long range iterator is now registered as Iterator. Patch by Oren Milman. diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index f4a1e97..4ecd48a 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1860,6 +1860,39 @@ unicode_asucs4(PyObject *self, PyObject *args) } static PyObject * +unicode_copycharacters(PyObject *self, PyObject *args) +{ + PyObject *from, *to, *to_copy; + Py_ssize_t from_start, to_start, how_many, copied; + + if (!PyArg_ParseTuple(args, "UnOnn:unicode_copycharacters", &to, &to_start, + &from, &from_start, &how_many)) { + return NULL; + } + + if (PyUnicode_READY(to) < 0) { + return NULL; + } + + if (!(to_copy = PyUnicode_New(PyUnicode_GET_LENGTH(to), + PyUnicode_MAX_CHAR_VALUE(to)))) { + return NULL; + } + if (PyUnicode_Fill(to_copy, 0, PyUnicode_GET_LENGTH(to_copy), 0U) < 0) { + Py_DECREF(to_copy); + return NULL; + } + + if ((copied = PyUnicode_CopyCharacters(to_copy, to_start, from, + from_start, how_many)) < 0) { + Py_DECREF(to_copy); + return NULL; + } + + return Py_BuildValue("(Nn)", to_copy, copied); +} + +static PyObject * unicode_encodedecimal(PyObject *self, PyObject *args) { Py_UNICODE *unicode; @@ -3915,6 +3948,7 @@ static PyMethodDef TestMethods[] = { {"unicode_aswidechar", unicode_aswidechar, METH_VARARGS}, {"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS}, {"unicode_asucs4", unicode_asucs4, METH_VARARGS}, + {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS}, {"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS}, {"unicode_legacy_string", unicode_legacy_string, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 176ec13..b734eec 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1366,15 +1366,19 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, if (PyUnicode_READY(to) == -1) return -1; - if (from_start < 0) { + if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return -1; } - if (to_start < 0) { + if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return -1; } - how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); + if (how_many < 0) { + PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); + return -1; + } + how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { PyErr_Format(PyExc_SystemError, "Cannot write %zi characters at %zi " |