From 47541689ccea79dfcb055c6be5800b13fcb6bdd2 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 3 May 2019 20:58:16 +0200 Subject: bpo-28238: Implement "{*}tag" and "{ns}*" wildcard tag selection support for ElementPath, and extend the surrounding tests and docs. (GH-12997) --- Doc/library/xml.etree.elementtree.rst | 17 +++- Doc/whatsnew/3.8.rst | 5 ++ Lib/test/test_xml_etree.py | 56 +++++++++++++- Lib/xml/etree/ElementPath.py | 90 +++++++++++++++++++--- .../2019-04-28-15-01-29.bpo-28238.gdk38f.rst | 3 + Modules/_elementtree.c | 15 +++- 6 files changed, 171 insertions(+), 15 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index ef74d0c..c466731 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -399,6 +399,12 @@ module. We'll be using the ``countrydata`` XML document from the # All 'neighbor' nodes that are the second child of their parent root.findall(".//neighbor[2]") +For XML with namespaces, use the usual qualified ``{namespace}tag`` notation:: + + # All dublin-core "title" tags in the document + root.findall(".//{http://purl.org/dc/elements/1.1/}title") + + Supported XPath syntax ^^^^^^^^^^^^^^^^^^^^^^ @@ -411,9 +417,16 @@ Supported XPath syntax | | For example, ``spam`` selects all child elements | | | named ``spam``, and ``spam/egg`` selects all | | | grandchildren named ``egg`` in all children named | -| | ``spam``. | +| | ``spam``. ``{namespace}*`` selects all tags in the | +| | given namespace, ``{*}spam`` selects tags named | +| | ``spam`` in any (or no) namespace, and ``{}*`` | +| | only selects tags that are not in a namespace. | +| | | +| | .. versionchanged:: 3.8 | +| | Support for star-wildcards was added. | +-----------------------+------------------------------------------------------+ -| ``*`` | Selects all child elements. 
For example, ``*/egg`` | +| ``*`` | Selects all child elements, including comments and | +| | processing instructions. For example, ``*/egg`` | | | selects all grandchildren named ``egg``. | +-----------------------+------------------------------------------------------+ | ``.`` | Selects the current node. This is mostly useful | diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 82be927..764bd00 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -532,6 +532,11 @@ xml external entities by default. (Contributed by Christian Heimes in :issue:`17239`.) +* The ``.find*()`` methods in the :mod:`xml.etree.ElementTree` module + support wildcard searches like ``{*}tag`` which ignores the namespace + and ``{namespace}*`` which returns all tags in the given namespace. + (Contributed by Stefan Behnel in :issue:`28238`.) + * The :mod:`xml.etree.ElementTree` module provides a new function :func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0. (Contributed by Stefan Behnel in :issue:`13611`.) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index a59a11f..ca6862c 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1137,16 +1137,21 @@ class ElementTreeTest(unittest.TestCase): def test_xpath_tokenizer(self): # Test the XPath tokenizer. 
from xml.etree import ElementPath - def check(p, expected): + def check(p, expected, namespaces=None): self.assertEqual([op or tag - for op, tag in ElementPath.xpath_tokenizer(p)], + for op, tag in ElementPath.xpath_tokenizer(p, namespaces)], expected) # tests from the xml specification check("*", ['*']) + check("{ns}*", ['{ns}*']) + check("{}*", ['{}*']) + check("{*}tag", ['{*}tag']) + check("{*}*", ['{*}*']) check("text()", ['text', '()']) check("@name", ['@', 'name']) check("@*", ['@', '*']) + check("@{ns}attr", ['@', '{ns}attr']) check("para[1]", ['para', '[', '1', ']']) check("para[last()]", ['para', '[', 'last', '()', ']']) check("*/para", ['*', '/', 'para']) @@ -1158,6 +1163,7 @@ class ElementTreeTest(unittest.TestCase): check("//olist/item", ['//', 'olist', '/', 'item']) check(".", ['.']) check(".//para", ['.', '//', 'para']) + check(".//{*}tag", ['.', '//', '{*}tag']) check("..", ['..']) check("../@lang", ['..', '/', '@', 'lang']) check("chapter[title]", ['chapter', '[', 'title', ']']) @@ -1168,6 +1174,8 @@ class ElementTreeTest(unittest.TestCase): check("{http://spam}egg", ['{http://spam}egg']) check("./spam.egg", ['.', '/', 'spam.egg']) check(".//{http://spam}egg", ['.', '//', '{http://spam}egg']) + check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'], + {'xsd': 'http://www.w3.org/2001/XMLSchema'}) def test_processinginstruction(self): # Test ProcessingInstruction directly @@ -2669,6 +2677,50 @@ class ElementFindTest(unittest.TestCase): self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2) self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1) + def test_findall_wildcard(self): + root = ET.XML(''' + <a xmlns:x="X" xmlns:y="Y"> + <x:b><c/></x:b> + <b/> + <c><x:b/><b/></c><y:b/> + </a>''') + root.append(ET.Comment('test')) + + self.assertEqual(summarize_list(root.findall("{*}b")), + ['{X}b', 'b', '{Y}b']) + self.assertEqual(summarize_list(root.findall("{*}c")), + ['c']) + self.assertEqual(summarize_list(root.findall("{X}*")), + ['{X}b']) + 
self.assertEqual(summarize_list(root.findall("{Y}*")), + ['{Y}b']) + self.assertEqual(summarize_list(root.findall("{}*")), + ['b', 'c']) + self.assertEqual(summarize_list(root.findall("{}b")), # only for consistency + ['b']) + self.assertEqual(summarize_list(root.findall("{}b")), + summarize_list(root.findall("b"))) + self.assertEqual(summarize_list(root.findall("{*}*")), + ['{X}b', 'b', 'c', '{Y}b']) + # This is an unfortunate difference, but that's how find('*') works. + self.assertEqual(summarize_list(root.findall("{*}*") + [root[-1]]), + summarize_list(root.findall("*"))) + + self.assertEqual(summarize_list(root.findall(".//{*}b")), + ['{X}b', 'b', '{X}b', 'b', '{Y}b']) + self.assertEqual(summarize_list(root.findall(".//{*}c")), + ['c', 'c']) + self.assertEqual(summarize_list(root.findall(".//{X}*")), + ['{X}b', '{X}b']) + self.assertEqual(summarize_list(root.findall(".//{Y}*")), + ['{Y}b']) + self.assertEqual(summarize_list(root.findall(".//{}*")), + ['c', 'b', 'c', 'b']) + self.assertEqual(summarize_list(root.findall(".//{}b")), # only for consistency + ['b', 'b']) + self.assertEqual(summarize_list(root.findall(".//{}b")), + summarize_list(root.findall(".//b"))) + def test_bad_find(self): e = ET.XML(SAMPLE_XML) with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'): diff --git a/Lib/xml/etree/ElementPath.py b/Lib/xml/etree/ElementPath.py index b670d58..cfe72f2 100644 --- a/Lib/xml/etree/ElementPath.py +++ b/Lib/xml/etree/ElementPath.py @@ -99,13 +99,70 @@ def get_parent_map(context): parent_map[e] = p return parent_map + + +def _is_wildcard_tag(tag): + return tag[:3] == '{*}' or tag[-2:] == '}*' + + +def _prepare_tag(tag): + _isinstance, _str = isinstance, str + if tag == '{*}*': + # Same as '*', but no comments or processing instructions. + # It can be a surprise that '*' includes those, but there is no + # justification for '{*}*' doing the same. 
+ def select(context, result): + for elem in result: + if _isinstance(elem.tag, _str): + yield elem + elif tag == '{}*': + # Any tag that is not in a namespace. + def select(context, result): + for elem in result: + el_tag = elem.tag + if _isinstance(el_tag, _str) and el_tag[0] != '{': + yield elem + elif tag[:3] == '{*}': + # The tag in any (or no) namespace. + suffix = tag[2:] # '}name' + no_ns = slice(-len(suffix), None) + tag = tag[3:] + def select(context, result): + for elem in result: + el_tag = elem.tag + if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix: + yield elem + elif tag[-2:] == '}*': + # Any tag in the given namespace. + ns = tag[:-1] + ns_only = slice(None, len(ns)) + def select(context, result): + for elem in result: + el_tag = elem.tag + if _isinstance(el_tag, _str) and el_tag[ns_only] == ns: + yield elem + else: + raise RuntimeError(f"internal parser error, got {tag}") + return select + + def prepare_child(next, token): tag = token[1] - def select(context, result): - for elem in result: - for e in elem: - if e.tag == tag: - yield e + if _is_wildcard_tag(tag): + select_tag = _prepare_tag(tag) + def select(context, result): + def select_child(result): + for elem in result: + yield from elem + return select_tag(context, select_child(result)) + else: + if tag[:2] == '{}': + tag = tag[2:] # '{}tag' == 'tag' + def select(context, result): + for elem in result: + for e in elem: + if e.tag == tag: + yield e return select def prepare_star(next, token): @@ -130,11 +187,24 @@ def prepare_descendant(next, token): tag = token[1] else: raise SyntaxError("invalid descendant") - def select(context, result): - for elem in result: - for e in elem.iter(tag): - if e is not elem: - yield e + + if _is_wildcard_tag(tag): + select_tag = _prepare_tag(tag) + def select(context, result): + def select_child(result): + for elem in result: + for e in elem.iter(): + if e is not elem: + yield e + return select_tag(context, select_child(result)) + else: 
+ if tag[:2] == '{}': + tag = tag[2:] # '{}tag' == 'tag' + def select(context, result): + for elem in result: + for e in elem.iter(tag): + if e is not elem: + yield e return select def prepare_parent(next, token): diff --git a/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst b/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst new file mode 100644 index 0000000..62003a3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst @@ -0,0 +1,3 @@ +The ``.find*()`` methods of xml.etree.ElementTree can now search for +wildcards like ``{*}tag`` and ``{ns}*`` that match a tag in any namespace +or all tags in a namespace. Patch by Stefan Behnel. diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index b69e3a4..1e58ddb 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -1149,6 +1149,13 @@ checkpath(PyObject* tag) const Py_ssize_t len = PyUnicode_GET_LENGTH(tag); void *data = PyUnicode_DATA(tag); unsigned int kind = PyUnicode_KIND(tag); + if (len >= 3 && PyUnicode_READ(kind, data, 0) == '{' && ( + PyUnicode_READ(kind, data, 1) == '}' || ( + PyUnicode_READ(kind, data, 1) == '*' && + PyUnicode_READ(kind, data, 2) == '}'))) { + /* wildcard: '{}tag' or '{*}tag' */ + return 1; + } for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); if (ch == '{') @@ -1162,7 +1169,13 @@ checkpath(PyObject* tag) } if (PyBytes_Check(tag)) { char *p = PyBytes_AS_STRING(tag); - for (i = 0; i < PyBytes_GET_SIZE(tag); i++) { + const Py_ssize_t len = PyBytes_GET_SIZE(tag); + if (len >= 3 && p[0] == '{' && ( + p[1] == '}' || p[1] == '*' && p[2] == '}')) { + /* wildcard: '{}tag' or '{*}tag' */ + return 1; + } + for (i = 0; i < len; i++) { if (p[i] == '{') check = 0; else if (p[i] == '}') -- cgit v0.12