bpo-28238: Implement "{*}tag" and "{ns}*" wildcard tag selection support for ElementPath, and extend the surrounding tests and docs. (GH-12997)

author: Stefan Behnel <stefan_ml@behnel.de> 2019-05-03 18:58:16 (GMT)
committer: GitHub <noreply@github.com> 2019-05-03 18:58:16 (GMT)
commit: 47541689ccea79dfcb055c6be5800b13fcb6bdd2 (patch)
tree: 7580016557a064cc019fe41d1d62e57ac3dcc8c6 /Lib
parent: cf48e55f7f7718482fa712552f0cbc0aea1c826f (diff)
download: cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.zip cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.gz cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.bz2
2 files changed, 134 insertions, 12 deletions
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index a59a11f..ca6862c 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1137,16 +1137,21 @@ class ElementTreeTest(unittest.TestCase):
     def test_xpath_tokenizer(self):
         # Test the XPath tokenizer.
         from xml.etree import ElementPath
-        def check(p, expected):
+        def check(p, expected, namespaces=None):
             self.assertEqual([op or tag
-                              for op, tag in ElementPath.xpath_tokenizer(p)],
+                              for op, tag in ElementPath.xpath_tokenizer(p, namespaces)],
                              expected)
 
         # tests from the xml specification
         check("*", ['*'])
+        check("{ns}*", ['{ns}*'])
+        check("{}*", ['{}*'])
+        check("{*}tag", ['{*}tag'])
+        check("{*}*", ['{*}*'])
         check("text()", ['text', '()'])
         check("@name", ['@', 'name'])
         check("@*", ['@', '*'])
+        check("@{ns}attr", ['@', '{ns}attr'])
         check("para[1]", ['para', '[', '1', ']'])
         check("para[last()]", ['para', '[', 'last', '()', ']'])
         check("*/para", ['*', '/', 'para'])
@@ -1158,6 +1163,7 @@ class ElementTreeTest(unittest.TestCase):
         check("//olist/item", ['//', 'olist', '/', 'item'])
         check(".", ['.'])
         check(".//para", ['.', '//', 'para'])
+        check(".//{*}tag", ['.', '//', '{*}tag'])
         check("..", ['..'])
         check("../@lang", ['..', '/', '@', 'lang'])
         check("chapter[title]", ['chapter', '[', 'title', ']'])
@@ -1168,6 +1174,8 @@ class ElementTreeTest(unittest.TestCase):
         check("{http://spam}egg", ['{http://spam}egg'])
         check("./spam.egg", ['.', '/', 'spam.egg'])
         check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
+        check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'],
+              {'xsd': 'http://www.w3.org/2001/XMLSchema'})
 
     def test_processinginstruction(self):
         # Test ProcessingInstruction directly
@@ -2669,6 +2677,50 @@ class ElementFindTest(unittest.TestCase):
         self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
         self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
 
+    def test_findall_wildcard(self):
+        root = ET.XML('''
+            <a xmlns:x="X" xmlns:y="Y">
+                <x:b><c/></x:b>
+                <b/>
+                <c><x:b/><b/></c><y:b/>
+            </a>''')
+        root.append(ET.Comment('test'))
+
+        self.assertEqual(summarize_list(root.findall("{*}b")),
+                         ['{X}b', 'b', '{Y}b'])
+        self.assertEqual(summarize_list(root.findall("{*}c")),
+                         ['c'])
+        self.assertEqual(summarize_list(root.findall("{X}*")),
+                         ['{X}b'])
+        self.assertEqual(summarize_list(root.findall("{Y}*")),
+                         ['{Y}b'])
+        self.assertEqual(summarize_list(root.findall("{}*")),
+                         ['b', 'c'])
+        self.assertEqual(summarize_list(root.findall("{}b")),  # only for consistency
+                         ['b'])
+        self.assertEqual(summarize_list(root.findall("{}b")),
+                         summarize_list(root.findall("b")))
+        self.assertEqual(summarize_list(root.findall("{*}*")),
+                         ['{X}b', 'b', 'c', '{Y}b'])
+        # Unfortunate difference: '*' also matches comments and PIs, '{*}*' does not.
+        self.assertEqual(summarize_list(root.findall("{*}*") + [root[-1]]),
+                         summarize_list(root.findall("*")))
+
+        self.assertEqual(summarize_list(root.findall(".//{*}b")),
+                         ['{X}b', 'b', '{X}b', 'b', '{Y}b'])
+        self.assertEqual(summarize_list(root.findall(".//{*}c")),
+                         ['c', 'c'])
+        self.assertEqual(summarize_list(root.findall(".//{X}*")),
+                         ['{X}b', '{X}b'])
+        self.assertEqual(summarize_list(root.findall(".//{Y}*")),
+                         ['{Y}b'])
+        self.assertEqual(summarize_list(root.findall(".//{}*")),
+                         ['c', 'b', 'c', 'b'])
+        self.assertEqual(summarize_list(root.findall(".//{}b")),  # only for consistency
+                         ['b', 'b'])
+        self.assertEqual(summarize_list(root.findall(".//{}b")),
+                         summarize_list(root.findall(".//b")))
+
     def test_bad_find(self):
         e = ET.XML(SAMPLE_XML)
         with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'):
diff --git a/Lib/xml/etree/ElementPath.py b/Lib/xml/etree/ElementPath.py
index b670d58..cfe72f2 100644
--- a/Lib/xml/etree/ElementPath.py
+++ b/Lib/xml/etree/ElementPath.py
@@ -99,13 +99,70 @@ def get_parent_map(context):
                 parent_map[e] = p
     return parent_map
 
+
+
+def _is_wildcard_tag(tag):
+    return tag[:3] == '{*}' or tag[-2:] == '}*'
+
+
+def _prepare_tag(tag):
+    _isinstance, _str = isinstance, str
+    if tag == '{*}*':
+        # Same as '*', but no comments or processing instructions.
+        # It can be a surprise that '*' includes those, but there is no
+        # justification for '{*}*' doing the same.
+        def select(context, result):
+            for elem in result:
+                if _isinstance(elem.tag, _str):
+                    yield elem
+    elif tag == '{}*':
+        # Any tag that is not in a namespace.
+        def select(context, result):
+            for elem in result:
+                el_tag = elem.tag
+                if _isinstance(el_tag, _str) and el_tag[0] != '{':
+                    yield elem
+    elif tag[:3] == '{*}':
+        # The tag in any (or no) namespace.
+        suffix = tag[2:]  # '}name'
+        no_ns = slice(-len(suffix), None)
+        tag = tag[3:]
+        def select(context, result):
+            for elem in result:
+                el_tag = elem.tag
+                if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
+                    yield elem
+    elif tag[-2:] == '}*':
+        # Any tag in the given namespace.
+        ns = tag[:-1]
+        ns_only = slice(None, len(ns))
+        def select(context, result):
+            for elem in result:
+                el_tag = elem.tag
+                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
+                    yield elem
+    else:
+        raise RuntimeError(f"internal parser error, got {tag}")
+    return select
+
+
 def prepare_child(next, token):
     tag = token[1]
-    def select(context, result):
-        for elem in result:
-            for e in elem:
-                if e.tag == tag:
-                    yield e
+    if _is_wildcard_tag(tag):
+        select_tag = _prepare_tag(tag)
+        def select(context, result):
+            def select_child(result):
+                for elem in result:
+                    yield from elem
+            return select_tag(context, select_child(result))
+    else:
+        if tag[:2] == '{}':
+            tag = tag[2:]  # '{}tag' == 'tag'
+        def select(context, result):
+            for elem in result:
+                for e in elem:
+                    if e.tag == tag:
+                        yield e
     return select
 
 def prepare_star(next, token):
@@ -130,11 +187,24 @@ def prepare_descendant(next, token):
         tag = token[1]
     else:
         raise SyntaxError("invalid descendant")
-    def select(context, result):
-        for elem in result:
-            for e in elem.iter(tag):
-                if e is not elem:
-                    yield e
+
+    if _is_wildcard_tag(tag):
+        select_tag = _prepare_tag(tag)
+        def select(context, result):
+            def select_child(result):
+                for elem in result:
+                    for e in elem.iter():
+                        if e is not elem:
+                            yield e
+            return select_tag(context, select_child(result))
+    else:
+        if tag[:2] == '{}':
+            tag = tag[2:]  # '{}tag' == 'tag'
+        def select(context, result):
+            for elem in result:
+                for e in elem.iter(tag):
+                    if e is not elem:
+                        yield e
     return select
 
 def prepare_parent(next, token):
author	Stefan Behnel <stefan_ml@behnel.de>	2019-05-03 18:58:16 (GMT)
committer	GitHub <noreply@github.com>	2019-05-03 18:58:16 (GMT)
commit	47541689ccea79dfcb055c6be5800b13fcb6bdd2 (patch)
tree	7580016557a064cc019fe41d1d62e57ac3dcc8c6 /Lib
parent	cf48e55f7f7718482fa712552f0cbc0aea1c826f (diff)
download	cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.zip cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.gz cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.bz2