From 9ec5e25f26a490510bb5da5c26a276cd30a263a0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 7 Dec 2015 02:31:11 +0200 Subject: Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. ElementTree.XMLParser._setevents now accepts any objects with the append method, not just a list. --- Lib/xml/etree/ElementTree.py | 92 ++++++++++++++--------------------------- Misc/NEWS | 2 + Modules/_elementtree.c | 35 +++++++++------- Modules/clinic/_elementtree.c.h | 7 ++-- 4 files changed, 56 insertions(+), 80 deletions(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 62b5d3a..a58ef31 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -95,6 +95,7 @@ import sys import re import warnings import io +import collections import contextlib from . import ElementPath @@ -1198,16 +1199,37 @@ def iterparse(source, events=None, parser=None): Returns an iterator providing (event, elem) pairs. """ + # Use the internal, undocumented _parser argument for now; When the + # parser argument of iterparse is removed, this can be killed. + pullparser = XMLPullParser(events=events, _parser=parser) + def iterator(): + try: + while True: + yield from pullparser.read_events() + # load event buffer + data = source.read(16 * 1024) + if not data: + break + pullparser.feed(data) + root = pullparser._close_and_return_root() + yield from pullparser.read_events() + it.root = root + finally: + if close_source: + source.close() + + class IterParseIterator(collections.Iterator): + __next__ = iterator().__next__ + it = IterParseIterator() + it.root = None + del iterator, IterParseIterator + close_source = False if not hasattr(source, "read"): source = open(source, "rb") close_source = True - try: - return _IterParseIterator(source, events, parser, close_source) - except: - if close_source: - source.close() - raise + + return it class XMLPullParser: @@ -1217,9 +1239,7 @@ class XMLPullParser: # upon in user code. It will be removed in a future release. # See http://bugs.python.org/issue17741 for more details. - # _elementtree.c expects a list, not a deque - self._events_queue = [] - self._index = 0 + self._events_queue = collections.deque() self._parser = _parser or XMLParser(target=TreeBuilder()) # wire up the parser for event reporting if events is None: @@ -1257,64 +1277,14 @@ class XMLPullParser: retrieved from the iterator. """ events = self._events_queue - while True: - index = self._index - try: - event = events[self._index] - # Avoid retaining references to past events - events[self._index] = None - except IndexError: - break - index += 1 - # Compact the list in a O(1) amortized fashion - # As noted above, _elementree.c needs a list, not a deque - if index * 2 >= len(events): - events[:index] = [] - self._index = 0 - else: - self._index = index + while events: + event = events.popleft() if isinstance(event, Exception): raise event else: yield event -class _IterParseIterator: - - def __init__(self, source, events, parser, close_source=False): - # Use the internal, undocumented _parser argument for now; When the - # parser argument of iterparse is removed, this can be killed. - self._parser = XMLPullParser(events=events, _parser=parser) - self._file = source - self._close_file = close_source - self.root = self._root = None - - def __next__(self): - try: - while 1: - for event in self._parser.read_events(): - return event - if self._parser._parser is None: - break - # load event buffer - data = self._file.read(16 * 1024) - if data: - self._parser.feed(data) - else: - self._root = self._parser._close_and_return_root() - self.root = self._root - except: - if self._close_file: - self._file.close() - raise - if self._close_file: - self._file.close() - raise StopIteration - - def __iter__(self): - return self - - def XML(text, parser=None): """Parse XML document from string constant. diff --git a/Misc/NEWS b/Misc/NEWS index 578d2a3..d235f64 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -109,6 +109,8 @@ Core and Builtins Library ------- +- Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. + - Issue #25761: Improved detecting errors in broken pickle data. - Issue #25717: Restore the previous behaviour of tolerating most fstat() diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index 9026585..168c7d6 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -2289,7 +2289,7 @@ typedef struct { PyObject *element_factory; /* element tracing */ - PyObject *events; /* list of events, or NULL if not collecting */ + PyObject *events_append; /* the append method of the list of events, or NULL */ PyObject *start_event_obj; /* event objects (NULL to ignore) */ PyObject *end_event_obj; PyObject *start_ns_event_obj; @@ -2324,7 +2324,7 @@ treebuilder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } t->index = 0; - t->events = NULL; + t->events_append = NULL; t->start_event_obj = t->end_event_obj = NULL; t->start_ns_event_obj = t->end_ns_event_obj = NULL; } @@ -2374,7 +2374,7 @@ treebuilder_gc_clear(TreeBuilderObject *self) Py_CLEAR(self->start_ns_event_obj); Py_CLEAR(self->end_event_obj); Py_CLEAR(self->start_event_obj); - Py_CLEAR(self->events); + Py_CLEAR(self->events_append); Py_CLEAR(self->stack); Py_CLEAR(self->data); Py_CLEAR(self->last); @@ -2455,13 +2455,14 @@ treebuilder_append_event(TreeBuilderObject *self, PyObject *action, PyObject *node) { if (action != NULL) { - PyObject *res = PyTuple_Pack(2, action, node); - if (res == NULL) + PyObject *res; + PyObject *event = PyTuple_Pack(2, action, node); + if (event == NULL) return -1; - if (PyList_Append(self->events, res) < 0) { - Py_DECREF(res); + res = PyObject_CallFunctionObjArgs(self->events_append, event, NULL); + Py_DECREF(event); + if (res == NULL) return -1; - } Py_DECREF(res); } return 0; @@ -3039,7 +3040,7 @@ expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix, if (PyErr_Occurred()) return; - if (!target->events || !target->start_ns_event_obj) + if (!target->events_append || !target->start_ns_event_obj) return; if (!uri) @@ -3062,7 +3063,7 @@ expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in) if (PyErr_Occurred()) return; - if (!target->events) + if (!target->events_append) return; treebuilder_append_event(target, target->end_ns_event_obj, Py_None); @@ -3551,7 +3552,7 @@ _elementtree_XMLParser_doctype_impl(XMLParserObject *self, PyObject *name, /*[clinic input] _elementtree.XMLParser._setevents - events_queue: object(subclass_of='&PyList_Type') + events_queue: object events_to_report: object = None / @@ -3561,12 +3562,12 @@ static PyObject * _elementtree_XMLParser__setevents_impl(XMLParserObject *self, PyObject *events_queue, PyObject *events_to_report) -/*[clinic end generated code: output=1440092922b13ed1 input=59db9742910c6174]*/ +/*[clinic end generated code: output=1440092922b13ed1 input=abf90830a1c3b0fc]*/ { /* activate element event reporting */ Py_ssize_t i, seqlen; TreeBuilderObject *target; - PyObject *events_seq; + PyObject *events_append, *events_seq; if (!TreeBuilder_CheckExact(self->target)) { PyErr_SetString( @@ -3579,9 +3580,11 @@ _elementtree_XMLParser__setevents_impl(XMLParserObject *self, target = (TreeBuilderObject*) self->target; - Py_INCREF(events_queue); - Py_XDECREF(target->events); - target->events = events_queue; + events_append = PyObject_GetAttrString(events_queue, "append"); + if (events_append == NULL) + return NULL; + Py_XDECREF(target->events_append); + target->events_append = events_append; /* clear out existing events */ Py_CLEAR(target->start_event_obj); diff --git a/Modules/clinic/_elementtree.c.h b/Modules/clinic/_elementtree.c.h index 86b4c4c..92e98cf 100644 --- a/Modules/clinic/_elementtree.c.h +++ b/Modules/clinic/_elementtree.c.h @@ -668,12 +668,13 @@ _elementtree_XMLParser__setevents(XMLParserObject *self, PyObject *args) PyObject *events_queue; PyObject *events_to_report = Py_None; - if (!PyArg_ParseTuple(args, "O!|O:_setevents", - &PyList_Type, &events_queue, &events_to_report)) + if (!PyArg_UnpackTuple(args, "_setevents", + 1, 2, + &events_queue, &events_to_report)) goto exit; return_value = _elementtree_XMLParser__setevents_impl(self, events_queue, events_to_report); exit: return return_value; } -/*[clinic end generated code: output=25b8bf7e7f2151ca input=a9049054013a1b77]*/ +/*[clinic end generated code: output=19d94e2d2726d3aa input=a9049054013a1b77]*/ -- cgit v0.12