summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com> 2022-04-18 14:18:27 (GMT)
committer GitHub <noreply@github.com> 2022-04-18 14:18:27 (GMT)
commit 8c54c3dacccb12a712acaa48d86a54f9ee9e37b5 (patch)
tree 429abc799a7e0e6269fb2bec5c607c69809bc850
parent a29f858124bc698f6604716b73306c65b63b5054 (diff)
download cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.zip
cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.tar.gz
cpython-8c54c3dacccb12a712acaa48d86a54f9ee9e37b5.tar.bz2
gh-91576: Speed up iteration of strings (#91574)
-rw-r--r-- Include/internal/pycore_unicodeobject.h 1
-rw-r--r-- Lib/test/test_unicode.py 31
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst 1
-rw-r--r-- Objects/object.c 1
-rw-r--r-- Objects/unicodeobject.c 51
5 files changed, 79 insertions, 6 deletions
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
index c7f0605..75b9050 100644
--- a/Include/internal/pycore_unicodeobject.h
+++ b/Include/internal/pycore_unicodeobject.h
@@ -20,6 +20,7 @@ extern void _PyUnicode_Fini(PyInterpreterState *);
extern void _PyUnicode_FiniTypes(PyInterpreterState *);
extern void _PyStaticUnicode_Dealloc(PyObject *);
+extern PyTypeObject _PyUnicodeASCIIIter_Type;
/* other API */
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index df7afd5..c98fabf 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -9,6 +9,7 @@ import _string
import codecs
import itertools
import operator
+import pickle
import struct
import sys
import textwrap
@@ -185,6 +186,36 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual(next(it), "\u3333")
self.assertRaises(StopIteration, next, it)
+ def test_iterators_invocation(self):
+ cases = [type(iter('abc')), type(iter('🚀'))]
+ for cls in cases:
+ with self.subTest(cls=cls):
+ self.assertRaises(TypeError, cls)
+
+ def test_iteration(self):
+ cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
+ for case in cases:
+ with self.subTest(string=case):
+ self.assertEqual(case, "".join(iter(case)))
+
+ def test_exhausted_iterator(self):
+ cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
+ for case in cases:
+ with self.subTest(case=case):
+ iterator = iter(case)
+ tuple(iterator)
+ self.assertRaises(StopIteration, next, iterator)
+
+ def test_pickle_iterator(self):
+ cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
+ for case in cases:
+ with self.subTest(case=case):
+ for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+ it = iter(case)
+ with self.subTest(proto=proto):
+ pickled = "".join(pickle.loads(pickle.dumps(it, proto)))
+ self.assertEqual(case, pickled)
+
def test_count(self):
string_tests.CommonTest.test_count(self)
# check mixed argument types
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst
new file mode 100644
index 0000000..b792f3e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst
@@ -0,0 +1 @@
+Speed up iteration of ascii strings by 50%. Patch by Kumar Aditya.
diff --git a/Objects/object.c b/Objects/object.c
index 33dab5e..fe2d76f 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -1936,6 +1936,7 @@ static PyTypeObject* static_types[] = {
&_PyNamespace_Type,
&_PyNone_Type,
&_PyNotImplemented_Type,
+ &_PyUnicodeASCIIIter_Type,
&_PyUnion_Type,
&_PyWeakref_CallableProxyType,
&_PyWeakref_ProxyType,
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d35a671..6b05c37 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -15697,7 +15697,7 @@ unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
static PyObject *
unicodeiter_next(unicodeiterobject *it)
{
- PyObject *seq, *item;
+ PyObject *seq;
assert(it != NULL);
seq = it->it_seq;
@@ -15709,10 +15709,8 @@ unicodeiter_next(unicodeiterobject *it)
int kind = PyUnicode_KIND(seq);
const void *data = PyUnicode_DATA(seq);
Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
- item = PyUnicode_FromOrdinal(chr);
- if (item != NULL)
- ++it->it_index;
- return item;
+ it->it_index++;
+ return unicode_char(chr);
}
it->it_seq = NULL;
@@ -15721,6 +15719,29 @@ unicodeiter_next(unicodeiterobject *it)
}
static PyObject *
+unicode_ascii_iter_next(unicodeiterobject *it)
+{
+ assert(it != NULL);
+ PyObject *seq = it->it_seq;
+ if (seq == NULL) {
+ return NULL;
+ }
+ assert(_PyUnicode_CHECK(seq));
+ assert(PyUnicode_IS_COMPACT_ASCII(seq));
+ if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
+ const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
+ Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
+ data, it->it_index);
+ it->it_index++;
+ PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
+ return Py_NewRef(item);
+ }
+ it->it_seq = NULL;
+ Py_DECREF(seq);
+ return NULL;
+}
+
+static PyObject *
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
{
Py_ssize_t len = 0;
@@ -15808,6 +15829,19 @@ PyTypeObject PyUnicodeIter_Type = {
0,
};
+PyTypeObject _PyUnicodeASCIIIter_Type = {
+ PyVarObject_HEAD_INIT(&PyType_Type, 0)
+ .tp_name = "str_ascii_iterator",
+ .tp_basicsize = sizeof(unicodeiterobject),
+ .tp_dealloc = (destructor)unicodeiter_dealloc,
+ .tp_getattro = PyObject_GenericGetAttr,
+ .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
+ .tp_traverse = (traverseproc)unicodeiter_traverse,
+ .tp_iter = PyObject_SelfIter,
+ .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
+ .tp_methods = unicodeiter_methods,
+};
+
static PyObject *
unicode_iter(PyObject *seq)
{
@@ -15819,7 +15853,12 @@ unicode_iter(PyObject *seq)
}
if (PyUnicode_READY(seq) == -1)
return NULL;
- it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
+ if (PyUnicode_IS_COMPACT_ASCII(seq)) {
+ it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
+ }
+ else {
+ it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
+ }
if (it == NULL)
return NULL;
it->it_index = 0;