summaryrefslogtreecommitdiffstats
path: root/Modules/_elementtree.c
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-12-21 10:43:54 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2015-12-21 10:43:54 (GMT)
commit22adf2ac022d6667a8689d6fac8af44c7663e3f2 (patch)
treec75f6234a3b747a747ab3c9aaa738fc0139c85db /Modules/_elementtree.c
parent47a9d59d5115fc2d9c29035d9ae4418eb1800f44 (diff)
downloadcpython-22adf2ac022d6667a8689d6fac8af44c7663e3f2.zip
cpython-22adf2ac022d6667a8689d6fac8af44c7663e3f2.tar.gz
cpython-22adf2ac022d6667a8689d6fac8af44c7663e3f2.tar.bz2
Issue #25873: Optimized iterating ElementTree.
Iterating elements Element.iter() is now 40% faster, iterating text Element.itertext() is now up to 2.5 times faster.
Diffstat (limited to 'Modules/_elementtree.c')
-rw-r--r--Modules/_elementtree.c259
1 files changed, 101 insertions, 158 deletions
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index 949e3a0..ad3e45c 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -1986,22 +1986,22 @@ static PySequenceMethods element_as_sequence = {
* pre-order traversal. To keep track of which sub-element should be returned
* next, a stack of parents is maintained. This is a standard stack-based
* iterative pre-order traversal of a tree.
- * The stack is managed using a single-linked list starting at parent_stack.
- * Each stack node contains the saved parent to which we should return after
+ * The stack is managed using a continuous array.
+ * Each stack item contains the saved parent to which we should return after
* the current one is exhausted, and the next child to examine in that parent.
*/
typedef struct ParentLocator_t {
ElementObject *parent;
Py_ssize_t child_index;
- struct ParentLocator_t *next;
} ParentLocator;
typedef struct {
PyObject_HEAD
ParentLocator *parent_stack;
+ Py_ssize_t parent_stack_used;
+ Py_ssize_t parent_stack_size;
ElementObject *root_element;
PyObject *sought_tag;
- int root_done;
int gettext;
} ElementIterObject;
@@ -2009,13 +2009,11 @@ typedef struct {
static void
elementiter_dealloc(ElementIterObject *it)
{
- ParentLocator *p = it->parent_stack;
- while (p) {
- ParentLocator *temp = p;
- Py_XDECREF(p->parent);
- p = p->next;
- PyObject_Free(temp);
- }
+ Py_ssize_t i = it->parent_stack_used;
+ it->parent_stack_used = 0;
+ while (i--)
+ Py_XDECREF(it->parent_stack[i].parent);
+ PyMem_Free(it->parent_stack);
Py_XDECREF(it->sought_tag);
Py_XDECREF(it->root_element);
@@ -2027,11 +2025,9 @@ elementiter_dealloc(ElementIterObject *it)
static int
elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg)
{
- ParentLocator *p = it->parent_stack;
- while (p) {
- Py_VISIT(p->parent);
- p = p->next;
- }
+ Py_ssize_t i = it->parent_stack_used;
+ while (i--)
+ Py_VISIT(it->parent_stack[i].parent);
Py_VISIT(it->root_element);
Py_VISIT(it->sought_tag);
@@ -2040,17 +2036,25 @@ elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg)
/* Helper function for elementiter_next. Add a new parent to the parent stack.
*/
-static ParentLocator *
-parent_stack_push_new(ParentLocator *stack, ElementObject *parent)
+static int
+parent_stack_push_new(ElementIterObject *it, ElementObject *parent)
{
- ParentLocator *new_node = PyObject_Malloc(sizeof(ParentLocator));
- if (new_node) {
- new_node->parent = parent;
- Py_INCREF(parent);
- new_node->child_index = 0;
- new_node->next = stack;
+ ParentLocator *item;
+
+ if (it->parent_stack_used >= it->parent_stack_size) {
+ Py_ssize_t new_size = it->parent_stack_size * 2; /* never overflow */
+ ParentLocator *parent_stack = it->parent_stack;
+ PyMem_Resize(parent_stack, ParentLocator, new_size);
+ if (parent_stack == NULL)
+ return -1;
+ it->parent_stack = parent_stack;
+ it->parent_stack_size = new_size;
}
- return new_node;
+ item = it->parent_stack + it->parent_stack_used++;
+ Py_INCREF(parent);
+ item->parent = parent;
+ item->child_index = 0;
+ return 0;
}
static PyObject *
@@ -2067,151 +2071,91 @@ elementiter_next(ElementIterObject *it)
* - itertext() also has to handle tail, after finishing with all the
* children of a node.
*/
- ElementObject *cur_parent;
- Py_ssize_t child_index;
int rc;
ElementObject *elem;
+ PyObject *text;
while (1) {
/* Handle the case reached in the beginning and end of iteration, where
- * the parent stack is empty. The root_done flag gives us indication
- * whether we've just started iterating (so root_done is 0), in which
- * case the root is returned. If root_done is 1 and we're here, the
+ * the parent stack is empty. If root_element is NULL and we're here, the
* iterator is exhausted.
*/
- if (!it->parent_stack->parent) {
- if (it->root_done) {
+ if (!it->parent_stack_used) {
+ if (!it->root_element) {
PyErr_SetNone(PyExc_StopIteration);
return NULL;
- } else {
- elem = it->root_element;
- it->parent_stack = parent_stack_push_new(it->parent_stack,
- elem);
- if (!it->parent_stack) {
- PyErr_NoMemory();
- return NULL;
- }
+ }
- Py_INCREF(elem);
- it->root_done = 1;
- rc = (it->sought_tag == Py_None);
- if (!rc) {
- rc = PyObject_RichCompareBool(elem->tag,
- it->sought_tag, Py_EQ);
- if (rc < 0) {
- Py_DECREF(elem);
- return NULL;
- }
- }
- if (rc) {
- if (it->gettext) {
- PyObject *text = element_get_text(elem);
- if (!text) {
- Py_DECREF(elem);
- return NULL;
- }
- Py_INCREF(text);
- Py_DECREF(elem);
- rc = PyObject_IsTrue(text);
- if (rc > 0)
- return text;
- Py_DECREF(text);
- if (rc < 0)
- return NULL;
- } else {
- return (PyObject *)elem;
- }
- }
- else {
- Py_DECREF(elem);
+ elem = it->root_element; /* steals a reference */
+ it->root_element = NULL;
+ }
+ else {
+ /* See if there are children left to traverse in the current parent. If
+ * yes, visit the next child. If not, pop the stack and try again.
+ */
+ ParentLocator *item = &it->parent_stack[it->parent_stack_used - 1];
+ Py_ssize_t child_index = item->child_index;
+ ElementObjectExtra *extra;
+ elem = item->parent;
+ extra = elem->extra;
+ if (!extra || child_index >= extra->length) {
+ it->parent_stack_used--;
+ /* Note that extra condition on it->parent_stack_used here;
+ * this is because itertext() is supposed to only return *inner*
+ * text, not text following the element it began iteration with.
+ */
+ if (it->gettext && it->parent_stack_used) {
+ text = element_get_tail(elem);
+ goto gettext;
}
+ Py_DECREF(elem);
+ continue;
}
+
+ elem = (ElementObject *)extra->children[child_index];
+ item->child_index++;
+ Py_INCREF(elem);
}
- /* See if there are children left to traverse in the current parent. If
- * yes, visit the next child. If not, pop the stack and try again.
- */
- cur_parent = it->parent_stack->parent;
- child_index = it->parent_stack->child_index;
- if (cur_parent->extra && child_index < cur_parent->extra->length) {
- elem = (ElementObject *)cur_parent->extra->children[child_index];
- it->parent_stack->child_index++;
- it->parent_stack = parent_stack_push_new(it->parent_stack,
- elem);
- if (!it->parent_stack) {
- PyErr_NoMemory();
- return NULL;
- }
+ if (parent_stack_push_new(it, elem) < 0) {
+ Py_DECREF(elem);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ if (it->gettext) {
+ text = element_get_text(elem);
+ goto gettext;
+ }
- Py_INCREF(elem);
- if (it->gettext) {
- PyObject *text = element_get_text(elem);
- if (!text) {
- Py_DECREF(elem);
- return NULL;
- }
- Py_INCREF(text);
- Py_DECREF(elem);
- rc = PyObject_IsTrue(text);
- if (rc > 0)
- return text;
- Py_DECREF(text);
- if (rc < 0)
- return NULL;
- } else {
- rc = (it->sought_tag == Py_None);
- if (!rc) {
- rc = PyObject_RichCompareBool(elem->tag,
- it->sought_tag, Py_EQ);
- if (rc < 0) {
- Py_DECREF(elem);
- return NULL;
- }
- }
- if (rc) {
- return (PyObject *)elem;
- }
- Py_DECREF(elem);
- }
+ if (it->sought_tag == Py_None)
+ return (PyObject *)elem;
+
+ rc = PyObject_RichCompareBool(elem->tag, it->sought_tag, Py_EQ);
+ if (rc > 0)
+ return (PyObject *)elem;
+
+ Py_DECREF(elem);
+ if (rc < 0)
+ return NULL;
+ continue;
+
+gettext:
+ if (!text) {
+ Py_DECREF(elem);
+ return NULL;
+ }
+ if (text == Py_None) {
+ Py_DECREF(elem);
}
else {
- PyObject *tail;
- ParentLocator *next;
- if (it->gettext) {
- Py_INCREF(cur_parent);
- tail = element_get_tail(cur_parent);
- if (!tail) {
- Py_DECREF(cur_parent);
- return NULL;
- }
- Py_INCREF(tail);
- Py_DECREF(cur_parent);
- }
- else {
- tail = Py_None;
- Py_INCREF(tail);
- }
- next = it->parent_stack->next;
- cur_parent = it->parent_stack->parent;
- PyObject_Free(it->parent_stack);
- it->parent_stack = next;
- Py_XDECREF(cur_parent);
-
- /* Note that extra condition on it->parent_stack->parent here;
- * this is because itertext() is supposed to only return *inner*
- * text, not text following the element it began iteration with.
- */
- if (it->parent_stack->parent) {
- rc = PyObject_IsTrue(tail);
- if (rc > 0)
- return tail;
- Py_DECREF(tail);
- if (rc < 0)
- return NULL;
- }
- else {
- Py_DECREF(tail);
- }
+ Py_INCREF(text);
+ Py_DECREF(elem);
+ rc = PyObject_IsTrue(text);
+ if (rc > 0)
+ return text;
+ Py_DECREF(text);
+ if (rc < 0)
+ return NULL;
}
}
@@ -2263,6 +2207,7 @@ static PyTypeObject ElementIter_Type = {
0, /* tp_new */
};
+#define INIT_PARENT_STACK_SIZE 8
static PyObject *
create_elementiter(ElementObject *self, PyObject *tag, int gettext)
@@ -2275,22 +2220,20 @@ create_elementiter(ElementObject *self, PyObject *tag, int gettext)
Py_INCREF(tag);
it->sought_tag = tag;
- it->root_done = 0;
it->gettext = gettext;
Py_INCREF(self);
it->root_element = self;
PyObject_GC_Track(it);
- it->parent_stack = PyObject_Malloc(sizeof(ParentLocator));
+ it->parent_stack = PyMem_New(ParentLocator, INIT_PARENT_STACK_SIZE);
if (it->parent_stack == NULL) {
Py_DECREF(it);
PyErr_NoMemory();
return NULL;
}
- it->parent_stack->parent = NULL;
- it->parent_stack->child_index = 0;
- it->parent_stack->next = NULL;
+ it->parent_stack_used = 0;
+ it->parent_stack_size = INIT_PARENT_STACK_SIZE;
return (PyObject *)it;
}