summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/library/xml.etree.elementtree.rst 45
-rw-r--r-- Lib/test/test_xml_etree.py 130
-rw-r--r-- Lib/xml/etree/ElementTree.py 203
-rw-r--r-- Misc/NEWS 3
4 files changed, 272 insertions, 109 deletions
diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst
index 6597a25..da03764 100644
--- a/Doc/library/xml.etree.elementtree.rst
+++ b/Doc/library/xml.etree.elementtree.rst
@@ -397,6 +397,9 @@ Functions
If you need a fully populated element, look for "end" events instead.
+ .. note::
+ For real event-driven parsing, see :class:`IncrementalParser`.
+
.. function:: parse(source, parser=None)
@@ -833,6 +836,48 @@ QName Objects
:class:`QName` instances are opaque.
+IncrementalParser Objects
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+.. class:: IncrementalParser(events=None, parser=None)
+
+ An incremental, event-driven parser suitable for non-blocking applications.
+ *events* is a list of events to report back. The supported events are the
+ strings ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns"
+ events are used to get detailed namespace information). If *events* is
+ omitted, only ``"end"`` events are reported. *parser* is an optional
+ parser instance. If not given, the standard :class:`XMLParser` parser is
+ used.
+
+ .. method:: data_received(data)
+
+ Feed the given bytes data to the incremental parser.
+
+ .. method:: eof_received()
+
+ Signal the incremental parser that the data stream is terminated.
+
+ .. method:: events()
+
+ Iterate over the events which have been encountered in the data fed
+ to the parser. This method yields ``(event, elem)`` pairs, where
+ *event* is a string representing the type of event (e.g. ``"end"``)
+ and *elem* is the encountered :class:`Element` object.
+
+ .. note::
+
+ :class:`IncrementalParser` only guarantees that it has seen the ">"
+ character of a starting tag when it emits a "start" event, so the
+ attributes are defined, but the contents of the text and tail attributes
+ are undefined at that point. The same applies to the element children;
+ they may or may not be present.
+
+ If you need a fully populated element, look for "end" events instead.
+
+ .. versionadded:: 3.4
+
+
.. _elementtree-treebuilder-objects:
TreeBuilder Objects
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 001c13d..49a6914 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -903,6 +903,134 @@ class ElementTreeTest(unittest.TestCase):
self.assertEqual(serialized, expected)
+class IncrementalParserTest(unittest.TestCase):
+
+ def _feed(self, parser, data, chunk_size=None):
+ if chunk_size is None:
+ parser.data_received(data)
+ else:
+ for i in range(0, len(data), chunk_size):
+ parser.data_received(data[i:i+chunk_size])
+
+ def assert_event_tags(self, parser, expected):
+ events = parser.events()
+ self.assertEqual([(action, elem.tag) for action, elem in events],
+ expected)
+
+ def test_simple_xml(self):
+ for chunk_size in (None, 1, 5):
+ with self.subTest(chunk_size=chunk_size):
+ parser = ET.IncrementalParser()
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<!-- comment -->\n", chunk_size)
+ self.assert_event_tags(parser, [])
+ self._feed(parser,
+ "<root>\n <element key='value'>text</element",
+ chunk_size)
+ self.assert_event_tags(parser, [])
+ self._feed(parser, ">\n", chunk_size)
+ self.assert_event_tags(parser, [('end', 'element')])
+ self._feed(parser, "<element>text</element>tail\n", chunk_size)
+ self._feed(parser, "<empty-element/>\n", chunk_size)
+ self.assert_event_tags(parser, [
+ ('end', 'element'),
+ ('end', 'empty-element'),
+ ])
+ self._feed(parser, "</root>\n", chunk_size)
+ self.assert_event_tags(parser, [('end', 'root')])
+ # Receiving EOF sets the `root` attribute
+ self.assertIs(parser.root, None)
+ parser.eof_received()
+ self.assertEqual(parser.root.tag, 'root')
+
+ def test_data_received_while_iterating(self):
+ parser = ET.IncrementalParser()
+ it = parser.events()
+ self._feed(parser, "<root>\n <element key='value'>text</element>\n")
+ action, elem = next(it)
+ self.assertEqual((action, elem.tag), ('end', 'element'))
+ self._feed(parser, "</root>\n")
+ action, elem = next(it)
+ self.assertEqual((action, elem.tag), ('end', 'root'))
+ with self.assertRaises(StopIteration):
+ next(it)
+
+ def test_simple_xml_with_ns(self):
+ parser = ET.IncrementalParser()
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<!-- comment -->\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<root xmlns='namespace'>\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [('end', '{namespace}element')])
+ self._feed(parser, "<element>text</element>tail\n")
+ self._feed(parser, "<empty-element/>\n")
+ self.assert_event_tags(parser, [
+ ('end', '{namespace}element'),
+ ('end', '{namespace}empty-element'),
+ ])
+ self._feed(parser, "</root>\n")
+ self.assert_event_tags(parser, [('end', '{namespace}root')])
+ # Receiving EOF sets the `root` attribute
+ self.assertIs(parser.root, None)
+ parser.eof_received()
+ self.assertEqual(parser.root.tag, '{namespace}root')
+
+ def test_events(self):
+ parser = ET.IncrementalParser(events=())
+ self._feed(parser, "<root/>\n")
+ self.assert_event_tags(parser, [])
+
+ parser = ET.IncrementalParser(events=('start', 'end'))
+ self._feed(parser, "<!-- comment -->\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<root>\n")
+ self.assert_event_tags(parser, [('start', 'root')])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [('start', 'element')])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [('end', 'element')])
+ self._feed(parser,
+ "<element xmlns='foo'>text<empty-element/></element>tail\n")
+ self.assert_event_tags(parser, [
+ ('start', '{foo}element'),
+ ('start', '{foo}empty-element'),
+ ('end', '{foo}empty-element'),
+ ('end', '{foo}element'),
+ ])
+ self._feed(parser, "</root>")
+ parser.eof_received()
+ self.assertIs(parser.root, None)
+ self.assert_event_tags(parser, [('end', 'root')])
+ self.assertEqual(parser.root.tag, 'root')
+
+ parser = ET.IncrementalParser(events=('start',))
+ self._feed(parser, "<!-- comment -->\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<root>\n")
+ self.assert_event_tags(parser, [('start', 'root')])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [('start', 'element')])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser,
+ "<element xmlns='foo'>text<empty-element/></element>tail\n")
+ self.assert_event_tags(parser, [
+ ('start', '{foo}element'),
+ ('start', '{foo}empty-element'),
+ ])
+ self._feed(parser, "</root>")
+ parser.eof_received()
+ self.assertEqual(parser.root.tag, 'root')
+
+ def test_unknown_event(self):
+ with self.assertRaises(ValueError):
+ ET.IncrementalParser(events=('start', 'end', 'bogus'))
+
+
#
# xinclude tests (samples from appendix C of the xinclude specification)
@@ -1406,6 +1534,7 @@ class BugsTest(unittest.TestCase):
ET.register_namespace('test10777', 'http://myuri/')
ET.register_namespace('test10777', 'http://myuri/')
+
# --------------------------------------------------------------------
@@ -2301,6 +2430,7 @@ def test_main(module=None):
ElementSlicingTest,
BasicElementTest,
ElementTreeTest,
+ IncrementalParserTest,
IOTest,
ParseErrorTest,
XIncludeTest,
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index 9fd6e5e..9b42b37 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -1216,84 +1216,85 @@ def iterparse(source, events=None, parser=None):
if not hasattr(source, "read"):
source = open(source, "rb")
close_source = True
- if not parser:
- parser = XMLParser(target=TreeBuilder())
return _IterParseIterator(source, events, parser, close_source)
-class _IterParseIterator:
- def __init__(self, source, events, parser, close_source=False):
- self._file = source
- self._close_file = close_source
- self._events = []
+class IncrementalParser:
+
+ def __init__(self, events=None, parser=None):
+ # _elementtree.c expects a list, not a deque
+ self._events_queue = []
self._index = 0
- self._error = None
self.root = self._root = None
+ if not parser:
+ parser = XMLParser(target=TreeBuilder())
self._parser = parser
# wire up the parser for event reporting
- parser = self._parser._parser
- append = self._events.append
if events is None:
- events = ["end"]
- for event in events:
- if event == "start":
- try:
- parser.ordered_attributes = 1
- parser.specified_attributes = 1
- def handler(tag, attrib_in, event=event, append=append,
- start=self._parser._start_list):
- append((event, start(tag, attrib_in)))
- parser.StartElementHandler = handler
- except AttributeError:
- def handler(tag, attrib_in, event=event, append=append,
- start=self._parser._start):
- append((event, start(tag, attrib_in)))
- parser.StartElementHandler = handler
- elif event == "end":
- def handler(tag, event=event, append=append,
- end=self._parser._end):
- append((event, end(tag)))
- parser.EndElementHandler = handler
- elif event == "start-ns":
- def handler(prefix, uri, event=event, append=append):
- append((event, (prefix or "", uri or "")))
- parser.StartNamespaceDeclHandler = handler
- elif event == "end-ns":
- def handler(prefix, event=event, append=append):
- append((event, None))
- parser.EndNamespaceDeclHandler = handler
+ events = ("end",)
+ self._parser._setevents(self._events_queue, events)
+
+ def data_received(self, data):
+ if self._parser is None:
+ raise ValueError("data_received() called after end of stream")
+ if data:
+ try:
+ self._parser.feed(data)
+ except SyntaxError as exc:
+ self._events_queue.append(exc)
+
+ def eof_received(self):
+ self._root = self._parser.close()
+ self._parser = None
+ if self._index >= len(self._events_queue):
+ self.root = self._root
+
+ def events(self):
+ events = self._events_queue
+ while True:
+ index = self._index
+ try:
+ event = events[self._index]
+ # Avoid retaining references to past events
+ events[self._index] = None
+ except IndexError:
+ break
+ index += 1
+ # Compact the list in a O(1) amortized fashion
+ if index * 2 >= len(events):
+ events[:index] = []
+ self._index = 0
else:
- raise ValueError("unknown event %r" % event)
+ self._index = index
+ if isinstance(event, Exception):
+ raise event
+ else:
+ yield event
+ if self._parser is None:
+ self.root = self._root
+
+
+class _IterParseIterator(IncrementalParser):
+
+ def __init__(self, source, events, parser, close_source=False):
+ IncrementalParser.__init__(self, events, parser)
+ self._file = source
+ self._close_file = close_source
def __next__(self):
while 1:
- try:
- item = self._events[self._index]
- self._index += 1
- return item
- except IndexError:
- pass
- if self._error:
- e = self._error
- self._error = None
- raise e
+ for event in self.events():
+ return event
if self._parser is None:
- self.root = self._root
if self._close_file:
self._file.close()
raise StopIteration
# load event buffer
- del self._events[:]
- self._index = 0
data = self._file.read(16384)
if data:
- try:
- self._parser.feed(data)
- except SyntaxError as exc:
- self._error = exc
+ self.data_received(data)
else:
- self._root = self._parser.close()
- self._parser = None
+ self.eof_received()
def __iter__(self):
return self
@@ -1498,6 +1499,40 @@ class XMLParser:
except AttributeError:
pass # unknown
+ def _setevents(self, event_list, events):
+ # Internal API for IncrementalParser
+ parser = self._parser
+ append = event_list.append
+ for event in events:
+ if event == "start":
+ try:
+ parser.ordered_attributes = 1
+ parser.specified_attributes = 1
+ def handler(tag, attrib_in, event=event, append=append,
+ start=self._start_list):
+ append((event, start(tag, attrib_in)))
+ parser.StartElementHandler = handler
+ except AttributeError:
+ def handler(tag, attrib_in, event=event, append=append,
+ start=self._start):
+ append((event, start(tag, attrib_in)))
+ parser.StartElementHandler = handler
+ elif event == "end":
+ def handler(tag, event=event, append=append,
+ end=self._end):
+ append((event, end(tag)))
+ parser.EndElementHandler = handler
+ elif event == "start-ns":
+ def handler(prefix, uri, event=event, append=append):
+ append((event, (prefix or "", uri or "")))
+ parser.StartNamespaceDeclHandler = handler
+ elif event == "end-ns":
+ def handler(prefix, event=event, append=append):
+ append((event, None))
+ parser.EndNamespaceDeclHandler = handler
+ else:
+ raise ValueError("unknown event %r" % event)
+
def _raiseerror(self, value):
err = ParseError(value)
err.code = value.code
@@ -1635,7 +1670,7 @@ try:
except ImportError:
pass
else:
- # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser
+ # Overwrite 'ElementTree.parse' to use the C XMLParser
class ElementTree(ElementTree):
__doc__ = ElementTree.__doc__
@@ -1661,56 +1696,6 @@ else:
if close_source:
source.close()
- class iterparse:
- __doc__ = iterparse.__doc__
- root = None
- def __init__(self, source, events=None, parser=None):
- self._close_file = False
- if not hasattr(source, 'read'):
- source = open(source, 'rb')
- self._close_file = True
- self._file = source
- self._events = []
- self._index = 0
- self._error = None
- self.root = self._root = None
- if parser is None:
- parser = XMLParser(target=TreeBuilder())
- self._parser = parser
- self._parser._setevents(self._events, events)
-
- def __next__(self):
- while True:
- try:
- item = self._events[self._index]
- self._index += 1
- return item
- except IndexError:
- pass
- if self._error:
- e = self._error
- self._error = None
- raise e
- if self._parser is None:
- self.root = self._root
- if self._close_file:
- self._file.close()
- raise StopIteration
- # load event buffer
- del self._events[:]
- self._index = 0
- data = self._file.read(16384)
- if data:
- try:
- self._parser.feed(data)
- except SyntaxError as exc:
- self._error = exc
- else:
- self._root = self._parser.close()
- self._parser = None
-
- def __iter__(self):
- return self
# compatibility
XMLTreeBuilder = XMLParser
diff --git a/Misc/NEWS b/Misc/NEWS
index ddb5955..098d328 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
Library
-------
+- Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser
+ for non-blocking applications.
+
- Issue #17555: Fix ForkAwareThreadLock so that size of after fork
registry does not grow exponentially with generation of process.