summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Stefan Behnel <stefan_ml@behnel.de> 2019-05-01 19:49:58 (GMT)
committer: GitHub <noreply@github.com> 2019-05-01 19:49:58 (GMT)
commit: dde3eebdaa8d2c51971ca704d53af7cbcda8bb34 (patch)
tree: ff16947548ec92506e63f98bbf79d9ad7af296a8
parent: 43851a202cabce1e6be699e7177735c778b6697e (diff)
download: cpython-dde3eebdaa8d2c51971ca704d53af7cbcda8bb34.zip
download: cpython-dde3eebdaa8d2c51971ca704d53af7cbcda8bb34.tar.gz
download: cpython-dde3eebdaa8d2c51971ca704d53af7cbcda8bb34.tar.bz2
bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885)
* bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
-rw-r--r-- Doc/library/xml.etree.elementtree.rst | 22
-rw-r--r-- Lib/test/test_xml_etree.py | 93
-rw-r--r-- Lib/xml/etree/ElementTree.py | 30
-rw-r--r-- Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst | 3
-rw-r--r-- Modules/_elementtree.c | 140
5 files changed, 258 insertions, 30 deletions
diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst
index c9e04c2..66090af 100644
--- a/Doc/library/xml.etree.elementtree.rst
+++ b/Doc/library/xml.etree.elementtree.rst
@@ -1086,7 +1086,7 @@ TreeBuilder Objects
In addition, a custom :class:`TreeBuilder` object can provide the
- following method:
+ following methods:
.. method:: doctype(name, pubid, system)
@@ -1096,6 +1096,23 @@ TreeBuilder Objects
.. versionadded:: 3.2
+ .. method:: start_ns(prefix, uri)
+
+ Is called whenever the parser encounters a new namespace declaration,
+ before the ``start()`` callback for the opening element that defines it.
+ *prefix* is ``''`` for the default namespace and the declared
+ namespace prefix name otherwise. *uri* is the namespace URI.
+
+ .. versionadded:: 3.8
+
+ .. method:: end_ns(prefix)
+
+ Is called after the ``end()`` callback of an element that declared
+ a namespace prefix mapping, with the name of the *prefix* that went
+ out of scope.
+
+ .. versionadded:: 3.8
+
.. _elementtree-xmlparser-objects:
@@ -1131,7 +1148,8 @@ XMLParser Objects
:meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method
for each opening tag, its ``end(tag)`` method for each closing tag, and data
- is processed by method ``data(data)``. :meth:`XMLParser.close` calls
+ is processed by method ``data(data)``. For further supported callback
+ methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls
*target*\'s method ``close()``. :class:`XMLParser` can be used not only for
building a tree structure. This is an example of counting the maximum depth
of an XML file::
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 8a228b8..0abc42a 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -14,12 +14,13 @@ import locale
import operator
import pickle
import sys
+import textwrap
import types
import unittest
import warnings
import weakref
-from itertools import product
+from itertools import product, islice
from test import support
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
@@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase):
self.append(("pi", target, data))
def comment(self, data):
self.append(("comment", data))
+ def start_ns(self, prefix, uri):
+ self.append(("start-ns", prefix, uri))
+ def end_ns(self, prefix):
+ self.append(("end-ns", prefix))
builder = Builder()
parser = ET.XMLParser(target=builder)
parser.feed(data)
self.assertEqual(builder, [
('pi', 'pi', 'data'),
('comment', ' comment '),
+ ('start-ns', '', 'namespace'),
('start', '{namespace}root'),
('start', '{namespace}element'),
('end', '{namespace}element'),
@@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase):
('start', '{namespace}empty-element'),
('end', '{namespace}empty-element'),
('end', '{namespace}root'),
+ ('end-ns', ''),
])
+ def test_custom_builder_only_end_ns(self):
+ class Builder(list):
+ def end_ns(self, prefix):
+ self.append(("end-ns", prefix))
+
+ builder = Builder()
+ parser = ET.XMLParser(target=builder)
+ parser.feed(textwrap.dedent("""\
+ <?pi data?>
+ <!-- comment -->
+ <root xmlns='namespace' xmlns:p='pns' xmlns:a='ans'>
+ <a:element key='value'>text</a:element>
+ <p:element>text</p:element>tail
+ <empty-element/>
+ </root>
+ """))
+ self.assertEqual(builder, [
+ ('end-ns', 'a'),
+ ('end-ns', 'p'),
+ ('end-ns', ''),
+ ])
# Element.getchildren() and ElementTree.getiterator() are deprecated.
@checkwarnings(("This method will be removed in future versions. "
@@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase):
for i in range(0, len(data), chunk_size):
parser.feed(data[i:i+chunk_size])
- def assert_events(self, parser, expected):
+ def assert_events(self, parser, expected, max_events=None):
self.assertEqual(
[(event, (elem.tag, elem.text))
- for event, elem in parser.read_events()],
+ for event, elem in islice(parser.read_events(), max_events)],
expected)
- def assert_event_tags(self, parser, expected):
- events = parser.read_events()
+ def assert_event_tuples(self, parser, expected, max_events=None):
+ self.assertEqual(
+ list(islice(parser.read_events(), max_events)),
+ expected)
+
+ def assert_event_tags(self, parser, expected, max_events=None):
+ events = islice(parser.read_events(), max_events)
self.assertEqual([(action, elem.tag) for action, elem in events],
expected)
@@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase):
self.assertEqual(list(parser.read_events()), [('end-ns', None)])
self.assertIsNone(parser.close())
+ def test_ns_events_start(self):
+ parser = ET.XMLPullParser(events=('start-ns', 'start', 'end'))
+ self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
+ self.assert_event_tuples(parser, [
+ ('start-ns', ('', 'abc')),
+ ('start-ns', ('p', 'xyz')),
+ ], max_events=2)
+ self.assert_event_tags(parser, [
+ ('start', '{abc}tag'),
+ ], max_events=1)
+
+ self._feed(parser, "<child />\n")
+ self.assert_event_tags(parser, [
+ ('start', '{abc}child'),
+ ('end', '{abc}child'),
+ ])
+
+ self._feed(parser, "</tag>\n")
+ parser.close()
+ self.assert_event_tags(parser, [
+ ('end', '{abc}tag'),
+ ])
+
+ def test_ns_events_start_end(self):
+ parser = ET.XMLPullParser(events=('start-ns', 'start', 'end', 'end-ns'))
+ self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
+ self.assert_event_tuples(parser, [
+ ('start-ns', ('', 'abc')),
+ ('start-ns', ('p', 'xyz')),
+ ], max_events=2)
+ self.assert_event_tags(parser, [
+ ('start', '{abc}tag'),
+ ], max_events=1)
+
+ self._feed(parser, "<child />\n")
+ self.assert_event_tags(parser, [
+ ('start', '{abc}child'),
+ ('end', '{abc}child'),
+ ])
+
+ self._feed(parser, "</tag>\n")
+ parser.close()
+ self.assert_event_tags(parser, [
+ ('end', '{abc}tag'),
+ ], max_events=1)
+ self.assert_event_tuples(parser, [
+ ('end-ns', None),
+ ('end-ns', None),
+ ])
+
def test_events(self):
parser = ET.XMLPullParser(events=())
self._feed(parser, "<root/>\n")
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index c640048..5b26ac7 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -1518,6 +1518,10 @@ class XMLParser:
parser.StartElementHandler = self._start
if hasattr(target, 'end'):
parser.EndElementHandler = self._end
+ if hasattr(target, 'start_ns'):
+ parser.StartNamespaceDeclHandler = self._start_ns
+ if hasattr(target, 'end_ns'):
+ parser.EndNamespaceDeclHandler = self._end_ns
if hasattr(target, 'data'):
parser.CharacterDataHandler = target.data
# miscellaneous callbacks
@@ -1559,12 +1563,24 @@ class XMLParser:
append((event, end(tag)))
parser.EndElementHandler = handler
elif event_name == "start-ns":
- def handler(prefix, uri, event=event_name, append=append):
- append((event, (prefix or "", uri or "")))
+ # TreeBuilder does not implement .start_ns()
+ if hasattr(self.target, "start_ns"):
+ def handler(prefix, uri, event=event_name, append=append,
+ start_ns=self._start_ns):
+ append((event, start_ns(prefix, uri)))
+ else:
+ def handler(prefix, uri, event=event_name, append=append):
+ append((event, (prefix or '', uri or '')))
parser.StartNamespaceDeclHandler = handler
elif event_name == "end-ns":
- def handler(prefix, event=event_name, append=append):
- append((event, None))
+ # TreeBuilder does not implement .end_ns()
+ if hasattr(self.target, "end_ns"):
+ def handler(prefix, event=event_name, append=append,
+ end_ns=self._end_ns):
+ append((event, end_ns(prefix)))
+ else:
+ def handler(prefix, event=event_name, append=append):
+ append((event, None))
parser.EndNamespaceDeclHandler = handler
elif event_name == 'comment':
def handler(text, event=event_name, append=append, self=self):
@@ -1595,6 +1611,12 @@ class XMLParser:
self._names[key] = name
return name
+ def _start_ns(self, prefix, uri):
+ return self.target.start_ns(prefix or '', uri or '')
+
+ def _end_ns(self, prefix):
+ return self.target.end_ns(prefix or '')
+
def _start(self, tag, attr_list):
# Handler for expat's StartElementHandler. Since ordered_attributes
# is set, the attributes are reported as a list of alternating
diff --git a/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst b/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst
new file mode 100644
index 0000000..e0bede8
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst
@@ -0,0 +1,3 @@
+The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
+parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
+Patch by Stefan Behnel.
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index 5481c61..b69e3a4 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
return NULL;
}
+LOCAL(PyObject*)
+treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
+{
+ PyObject* parcel;
+
+ if (self->events_append && self->start_ns_event_obj) {
+ parcel = PyTuple_Pack(2, prefix, uri);
+ if (!parcel) {
+ return NULL;
+ }
+
+ if (treebuilder_append_event(self, self->start_ns_event_obj, parcel) < 0) {
+ Py_DECREF(parcel);
+ return NULL;
+ }
+ Py_DECREF(parcel);
+ }
+
+ Py_RETURN_NONE;
+}
+
+LOCAL(PyObject*)
+treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
+{
+ if (self->events_append && self->end_ns_event_obj) {
+ if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
+ return NULL;
+ }
+ }
+
+ Py_RETURN_NONE;
+}
+
/* -------------------------------------------------------------------- */
/* methods (in alphabetical order) */
@@ -3046,6 +3079,8 @@ typedef struct {
PyObject *names;
+ PyObject *handle_start_ns;
+ PyObject *handle_end_ns;
PyObject *handle_start;
PyObject *handle_data;
PyObject *handle_end;
@@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
}
static void
-expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
- const XML_Char *uri)
+expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
+ const XML_Char *uri_in)
{
- TreeBuilderObject *target = (TreeBuilderObject*) self->target;
- PyObject *parcel;
+ PyObject* res = NULL;
+ PyObject* uri;
+ PyObject* prefix;
+ PyObject* stack[2];
if (PyErr_Occurred())
return;
- if (!target->events_append || !target->start_ns_event_obj)
- return;
+ if (!uri_in)
+ uri_in = "";
+ if (!prefix_in)
+ prefix_in = "";
- if (!uri)
- uri = "";
- if (!prefix)
- prefix = "";
+ if (TreeBuilder_CheckExact(self->target)) {
+ /* shortcut - TreeBuilder does not actually implement .start_ns() */
+ TreeBuilderObject *target = (TreeBuilderObject*) self->target;
- parcel = Py_BuildValue("ss", prefix, uri);
- if (!parcel)
- return;
- treebuilder_append_event(target, target->start_ns_event_obj, parcel);
- Py_DECREF(parcel);
+ if (target->events_append && target->start_ns_event_obj) {
+ prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
+ if (!prefix)
+ return;
+ uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
+ if (!uri) {
+ Py_DECREF(prefix);
+ return;
+ }
+
+ res = treebuilder_handle_start_ns(target, prefix, uri);
+ Py_DECREF(uri);
+ Py_DECREF(prefix);
+ }
+ } else if (self->handle_start_ns) {
+ prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
+ if (!prefix)
+ return;
+ uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
+ if (!uri) {
+ Py_DECREF(prefix);
+ return;
+ }
+
+ stack[0] = prefix;
+ stack[1] = uri;
+ res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
+ Py_DECREF(uri);
+ Py_DECREF(prefix);
+ }
+
+ Py_XDECREF(res);
}
static void
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
{
- TreeBuilderObject *target = (TreeBuilderObject*) self->target;
+ PyObject *res = NULL;
+ PyObject* prefix;
if (PyErr_Occurred())
return;
- if (!target->events_append)
- return;
+ if (!prefix_in)
+ prefix_in = "";
+
+ if (TreeBuilder_CheckExact(self->target)) {
+ /* shortcut - TreeBuilder does not actually implement .end_ns() */
+ TreeBuilderObject *target = (TreeBuilderObject*) self->target;
+
+ if (target->events_append && target->end_ns_event_obj) {
+ res = treebuilder_handle_end_ns(target, Py_None);
+ }
+ } else if (self->handle_end_ns) {
+ prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
+ if (!prefix)
+ return;
+
+ res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
+ Py_DECREF(prefix);
+ }
- treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
+ Py_XDECREF(res);
}
static void
@@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self) {
self->parser = NULL;
self->target = self->entity = self->names = NULL;
+ self->handle_start_ns = self->handle_end_ns = NULL;
self->handle_start = self->handle_data = self->handle_end = NULL;
self->handle_comment = self->handle_pi = self->handle_close = NULL;
self->handle_doctype = NULL;
@@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
}
self->target = target;
+ self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
+ if (ignore_attribute_error(self->handle_start_ns)) {
+ return -1;
+ }
+ self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
+ if (ignore_attribute_error(self->handle_end_ns)) {
+ return -1;
+ }
self->handle_start = PyObject_GetAttrString(target, "start");
if (ignore_attribute_error(self->handle_start)) {
return -1;
@@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
/* configure parser */
EXPAT(SetUserData)(self->parser, self);
+ if (self->handle_start_ns || self->handle_end_ns)
+ EXPAT(SetNamespaceDeclHandler)(
+ self->parser,
+ (XML_StartNamespaceDeclHandler) expat_start_ns_handler,
+ (XML_EndNamespaceDeclHandler) expat_end_ns_handler
+ );
EXPAT(SetElementHandler)(
self->parser,
(XML_StartElementHandler) expat_start_handler,
@@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
Py_VISIT(self->handle_end);
Py_VISIT(self->handle_data);
Py_VISIT(self->handle_start);
+ Py_VISIT(self->handle_start_ns);
+ Py_VISIT(self->handle_end_ns);
+ Py_VISIT(self->handle_doctype);
Py_VISIT(self->target);
Py_VISIT(self->entity);
@@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self)
Py_CLEAR(self->handle_end);
Py_CLEAR(self->handle_data);
Py_CLEAR(self->handle_start);
+ Py_CLEAR(self->handle_start_ns);
+ Py_CLEAR(self->handle_end_ns);
Py_CLEAR(self->handle_doctype);
Py_CLEAR(self->target);