diff options
author | Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com> | 2022-04-18 14:18:27 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-18 14:18:27 (GMT) |
commit | 8c54c3dacccb12a712acaa48d86a54f9ee9e37b5 (patch) | |
tree | 429abc799a7e0e6269fb2bec5c607c69809bc850 | |
parent | a29f858124bc698f6604716b73306c65b63b5054 (diff) | |
download | cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.zip cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.tar.gz cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.tar.bz2 |
gh-91576: Speed up iteration of strings (#91574)
-rw-r--r-- | Include/internal/pycore_unicodeobject.h | 1 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 31 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst | 1 | ||||
-rw-r--r-- | Objects/object.c | 1 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 51 |
5 files changed, 79 insertions, 6 deletions
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index c7f0605..75b9050 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -20,6 +20,7 @@ extern void _PyUnicode_Fini(PyInterpreterState *); extern void _PyUnicode_FiniTypes(PyInterpreterState *); extern void _PyStaticUnicode_Dealloc(PyObject *); +extern PyTypeObject _PyUnicodeASCIIIter_Type; /* other API */ diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index df7afd5..c98fabf 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -9,6 +9,7 @@ import _string import codecs import itertools import operator +import pickle import struct import sys import textwrap @@ -185,6 +186,36 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(next(it), "\u3333") self.assertRaises(StopIteration, next, it) + def test_iterators_invocation(self): + cases = [type(iter('abc')), type(iter('🚀'))] + for cls in cases: + with self.subTest(cls=cls): + self.assertRaises(TypeError, cls) + + def test_iteration(self): + cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"] + for case in cases: + with self.subTest(string=case): + self.assertEqual(case, "".join(iter(case))) + + def test_exhausted_iterator(self): + cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"] + for case in cases: + with self.subTest(case=case): + iterator = iter(case) + tuple(iterator) + self.assertRaises(StopIteration, next, iterator) + + def test_pickle_iterator(self): + cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"] + for case in cases: + with self.subTest(case=case): + for proto in range(pickle.HIGHEST_PROTOCOL + 1): + it = iter(case) + with self.subTest(proto=proto): + pickled = "".join(pickle.loads(pickle.dumps(it, proto))) + self.assertEqual(case, pickled) + def test_count(self): string_tests.CommonTest.test_count(self) # check mixed argument types diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst new file mode 100644 index 0000000..b792f3e --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst @@ -0,0 +1 @@ +Speed up iteration of ascii strings by 50%. Patch by Kumar Aditya. diff --git a/Objects/object.c b/Objects/object.c index 33dab5e..fe2d76f 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -1936,6 +1936,7 @@ static PyTypeObject* static_types[] = { &_PyNamespace_Type, &_PyNone_Type, &_PyNotImplemented_Type, + &_PyUnicodeASCIIIter_Type, &_PyUnion_Type, &_PyWeakref_CallableProxyType, &_PyWeakref_ProxyType, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d35a671..6b05c37 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -15697,7 +15697,7 @@ unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) static PyObject * unicodeiter_next(unicodeiterobject *it) { - PyObject *seq, *item; + PyObject *seq; assert(it != NULL); seq = it->it_seq; @@ -15709,10 +15709,8 @@ unicodeiter_next(unicodeiterobject *it) int kind = PyUnicode_KIND(seq); const void *data = PyUnicode_DATA(seq); Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); - item = PyUnicode_FromOrdinal(chr); - if (item != NULL) - ++it->it_index; - return item; + it->it_index++; + return unicode_char(chr); } it->it_seq = NULL; @@ -15721,6 +15719,29 @@ unicodeiter_next(unicodeiterobject *it) } static PyObject * +unicode_ascii_iter_next(unicodeiterobject *it) +{ + assert(it != NULL); + PyObject *seq = it->it_seq; + if (seq == NULL) { + return NULL; + } + assert(_PyUnicode_CHECK(seq)); + assert(PyUnicode_IS_COMPACT_ASCII(seq)); + if (it->it_index < PyUnicode_GET_LENGTH(seq)) { + const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1)); + Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND, + data, it->it_index); + it->it_index++; + PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr]; + return Py_NewRef(item); + } + it->it_seq = NULL; + Py_DECREF(seq); + return NULL; +} + +static PyObject * unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored)) { Py_ssize_t len = 0; @@ -15808,6 +15829,19 @@ PyTypeObject PyUnicodeIter_Type = { 0, }; +PyTypeObject _PyUnicodeASCIIIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + .tp_name = "str_ascii_iterator", + .tp_basicsize = sizeof(unicodeiterobject), + .tp_dealloc = (destructor)unicodeiter_dealloc, + .tp_getattro = PyObject_GenericGetAttr, + .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, + .tp_traverse = (traverseproc)unicodeiter_traverse, + .tp_iter = PyObject_SelfIter, + .tp_iternext = (iternextfunc)unicode_ascii_iter_next, + .tp_methods = unicodeiter_methods, +}; + static PyObject * unicode_iter(PyObject *seq) { @@ -15819,7 +15853,12 @@ unicode_iter(PyObject *seq) } if (PyUnicode_READY(seq) == -1) return NULL; - it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); + if (PyUnicode_IS_COMPACT_ASCII(seq)) { + it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type); + } + else { + it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); + } if (it == NULL) return NULL; it->it_index = 0; |