summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2019-05-03 18:58:16 (GMT)
committer GitHub <noreply@github.com> 2019-05-03 18:58:16 (GMT)
commit 47541689ccea79dfcb055c6be5800b13fcb6bdd2 (patch)
tree 7580016557a064cc019fe41d1d62e57ac3dcc8c6
parent cf48e55f7f7718482fa712552f0cbc0aea1c826f (diff)
download cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.zip
cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.gz
cpython-47541689ccea79dfcb055c6be5800b13fcb6bdd2.tar.bz2
bpo-28238: Implement "{*}tag" and "{ns}*" wildcard tag selection support for ElementPath, and extend the surrounding tests and docs. (GH-12997)
-rw-r--r-- Doc/library/xml.etree.elementtree.rst 17
-rw-r--r-- Doc/whatsnew/3.8.rst 5
-rw-r--r-- Lib/test/test_xml_etree.py 56
-rw-r--r-- Lib/xml/etree/ElementPath.py 90
-rw-r--r-- Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst 3
-rw-r--r-- Modules/_elementtree.c 15
6 files changed, 171 insertions, 15 deletions
diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst
index ef74d0c..c466731 100644
--- a/Doc/library/xml.etree.elementtree.rst
+++ b/Doc/library/xml.etree.elementtree.rst
@@ -399,6 +399,12 @@ module. We'll be using the ``countrydata`` XML document from the
# All 'neighbor' nodes that are the second child of their parent
root.findall(".//neighbor[2]")
+For XML with namespaces, use the usual qualified ``{namespace}tag`` notation::
+
+ # All dublin-core "title" tags in the document
+ root.findall(".//{http://purl.org/dc/elements/1.1/}title")
+
+
Supported XPath syntax
^^^^^^^^^^^^^^^^^^^^^^
@@ -411,9 +417,16 @@ Supported XPath syntax
| | For example, ``spam`` selects all child elements |
| | named ``spam``, and ``spam/egg`` selects all |
| | grandchildren named ``egg`` in all children named |
-| | ``spam``. |
+| | ``spam``. ``{namespace}*`` selects all tags in the |
+| | given namespace, ``{*}spam`` selects tags named |
+| | ``spam`` in any (or no) namespace, and ``{}*`` |
+| | only selects tags that are not in a namespace. |
+| | |
+| | .. versionchanged:: 3.8 |
+| | Support for star-wildcards was added. |
+-----------------------+------------------------------------------------------+
-| ``*`` | Selects all child elements. For example, ``*/egg`` |
+| ``*`` | Selects all child elements, including comments and |
+| | processing instructions. For example, ``*/egg`` |
| | selects all grandchildren named ``egg``. |
+-----------------------+------------------------------------------------------+
| ``.`` | Selects the current node. This is mostly useful |
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 82be927..764bd00 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -532,6 +532,11 @@ xml
external entities by default.
(Contributed by Christian Heimes in :issue:`17239`.)
+* The ``.find*()`` methods in the :mod:`xml.etree.ElementTree` module
+ support wildcard searches like ``{*}tag`` which ignores the namespace
+ and ``{namespace}*`` which returns all tags in the given namespace.
+ (Contributed by Stefan Behnel in :issue:`28238`.)
+
* The :mod:`xml.etree.ElementTree` module provides a new function
:func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0.
(Contributed by Stefan Behnel in :issue:`13611`.)
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index a59a11f..ca6862c 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1137,16 +1137,21 @@ class ElementTreeTest(unittest.TestCase):
def test_xpath_tokenizer(self):
# Test the XPath tokenizer.
from xml.etree import ElementPath
- def check(p, expected):
+ def check(p, expected, namespaces=None):
self.assertEqual([op or tag
- for op, tag in ElementPath.xpath_tokenizer(p)],
+ for op, tag in ElementPath.xpath_tokenizer(p, namespaces)],
expected)
# tests from the xml specification
check("*", ['*'])
+ check("{ns}*", ['{ns}*'])
+ check("{}*", ['{}*'])
+ check("{*}tag", ['{*}tag'])
+ check("{*}*", ['{*}*'])
check("text()", ['text', '()'])
check("@name", ['@', 'name'])
check("@*", ['@', '*'])
+ check("@{ns}attr", ['@', '{ns}attr'])
check("para[1]", ['para', '[', '1', ']'])
check("para[last()]", ['para', '[', 'last', '()', ']'])
check("*/para", ['*', '/', 'para'])
@@ -1158,6 +1163,7 @@ class ElementTreeTest(unittest.TestCase):
check("//olist/item", ['//', 'olist', '/', 'item'])
check(".", ['.'])
check(".//para", ['.', '//', 'para'])
+ check(".//{*}tag", ['.', '//', '{*}tag'])
check("..", ['..'])
check("../@lang", ['..', '/', '@', 'lang'])
check("chapter[title]", ['chapter', '[', 'title', ']'])
@@ -1168,6 +1174,8 @@ class ElementTreeTest(unittest.TestCase):
check("{http://spam}egg", ['{http://spam}egg'])
check("./spam.egg", ['.', '/', 'spam.egg'])
check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
+ check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'],
+ {'xsd': 'http://www.w3.org/2001/XMLSchema'})
def test_processinginstruction(self):
# Test ProcessingInstruction directly
@@ -2669,6 +2677,50 @@ class ElementFindTest(unittest.TestCase):
self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
+ def test_findall_wildcard(self):
+ root = ET.XML('''
+ <a xmlns:x="X" xmlns:y="Y">
+ <x:b><c/></x:b>
+ <b/>
+ <c><x:b/><b/></c><y:b/>
+ </a>''')
+ root.append(ET.Comment('test'))
+
+ self.assertEqual(summarize_list(root.findall("{*}b")),
+ ['{X}b', 'b', '{Y}b'])
+ self.assertEqual(summarize_list(root.findall("{*}c")),
+ ['c'])
+ self.assertEqual(summarize_list(root.findall("{X}*")),
+ ['{X}b'])
+ self.assertEqual(summarize_list(root.findall("{Y}*")),
+ ['{Y}b'])
+ self.assertEqual(summarize_list(root.findall("{}*")),
+ ['b', 'c'])
+ self.assertEqual(summarize_list(root.findall("{}b")), # only for consistency
+ ['b'])
+ self.assertEqual(summarize_list(root.findall("{}b")),
+ summarize_list(root.findall("b")))
+ self.assertEqual(summarize_list(root.findall("{*}*")),
+ ['{X}b', 'b', 'c', '{Y}b'])
+ # This is an unfortunate difference, but that's how find('*') works.
+ self.assertEqual(summarize_list(root.findall("{*}*") + [root[-1]]),
+ summarize_list(root.findall("*")))
+
+ self.assertEqual(summarize_list(root.findall(".//{*}b")),
+ ['{X}b', 'b', '{X}b', 'b', '{Y}b'])
+ self.assertEqual(summarize_list(root.findall(".//{*}c")),
+ ['c', 'c'])
+ self.assertEqual(summarize_list(root.findall(".//{X}*")),
+ ['{X}b', '{X}b'])
+ self.assertEqual(summarize_list(root.findall(".//{Y}*")),
+ ['{Y}b'])
+ self.assertEqual(summarize_list(root.findall(".//{}*")),
+ ['c', 'b', 'c', 'b'])
+ self.assertEqual(summarize_list(root.findall(".//{}b")), # only for consistency
+ ['b', 'b'])
+ self.assertEqual(summarize_list(root.findall(".//{}b")),
+ summarize_list(root.findall(".//b")))
+
def test_bad_find(self):
e = ET.XML(SAMPLE_XML)
with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'):
diff --git a/Lib/xml/etree/ElementPath.py b/Lib/xml/etree/ElementPath.py
index b670d58..cfe72f2 100644
--- a/Lib/xml/etree/ElementPath.py
+++ b/Lib/xml/etree/ElementPath.py
@@ -99,13 +99,70 @@ def get_parent_map(context):
parent_map[e] = p
return parent_map
+
+
+def _is_wildcard_tag(tag):
+ return tag[:3] == '{*}' or tag[-2:] == '}*'
+
+
+def _prepare_tag(tag):
+ _isinstance, _str = isinstance, str
+ if tag == '{*}*':
+ # Same as '*', but no comments or processing instructions.
+ # It can be a surprise that '*' includes those, but there is no
+ # justification for '{*}*' doing the same.
+ def select(context, result):
+ for elem in result:
+ if _isinstance(elem.tag, _str):
+ yield elem
+ elif tag == '{}*':
+ # Any tag that is not in a namespace.
+ def select(context, result):
+ for elem in result:
+ el_tag = elem.tag
+ if _isinstance(el_tag, _str) and el_tag[0] != '{':
+ yield elem
+ elif tag[:3] == '{*}':
+ # The tag in any (or no) namespace.
+ suffix = tag[2:] # '}name'
+ no_ns = slice(-len(suffix), None)
+ tag = tag[3:]
+ def select(context, result):
+ for elem in result:
+ el_tag = elem.tag
+ if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
+ yield elem
+ elif tag[-2:] == '}*':
+ # Any tag in the given namespace.
+ ns = tag[:-1]
+ ns_only = slice(None, len(ns))
+ def select(context, result):
+ for elem in result:
+ el_tag = elem.tag
+ if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
+ yield elem
+ else:
+ raise RuntimeError(f"internal parser error, got {tag}")
+ return select
+
+
def prepare_child(next, token):
tag = token[1]
- def select(context, result):
- for elem in result:
- for e in elem:
- if e.tag == tag:
- yield e
+ if _is_wildcard_tag(tag):
+ select_tag = _prepare_tag(tag)
+ def select(context, result):
+ def select_child(result):
+ for elem in result:
+ yield from elem
+ return select_tag(context, select_child(result))
+ else:
+ if tag[:2] == '{}':
+ tag = tag[2:] # '{}tag' == 'tag'
+ def select(context, result):
+ for elem in result:
+ for e in elem:
+ if e.tag == tag:
+ yield e
return select
def prepare_star(next, token):
@@ -130,11 +187,24 @@ def prepare_descendant(next, token):
tag = token[1]
else:
raise SyntaxError("invalid descendant")
- def select(context, result):
- for elem in result:
- for e in elem.iter(tag):
- if e is not elem:
- yield e
+
+ if _is_wildcard_tag(tag):
+ select_tag = _prepare_tag(tag)
+ def select(context, result):
+ def select_child(result):
+ for elem in result:
+ for e in elem.iter():
+ if e is not elem:
+ yield e
+ return select_tag(context, select_child(result))
+ else:
+ if tag[:2] == '{}':
+ tag = tag[2:] # '{}tag' == 'tag'
+ def select(context, result):
+ for elem in result:
+ for e in elem.iter(tag):
+ if e is not elem:
+ yield e
return select
def prepare_parent(next, token):
diff --git a/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst b/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst
new file mode 100644
index 0000000..62003a3
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst
@@ -0,0 +1,3 @@
+The ``.find*()`` methods of xml.etree.ElementTree can now search for
+wildcards like ``{*}tag`` and ``{ns}*`` that match a tag in any namespace
+or all tags in a namespace. Patch by Stefan Behnel.
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index b69e3a4..1e58ddb 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -1149,6 +1149,13 @@ checkpath(PyObject* tag)
const Py_ssize_t len = PyUnicode_GET_LENGTH(tag);
void *data = PyUnicode_DATA(tag);
unsigned int kind = PyUnicode_KIND(tag);
+ if (len >= 3 && PyUnicode_READ(kind, data, 0) == '{' && (
+ PyUnicode_READ(kind, data, 1) == '}' || (
+ PyUnicode_READ(kind, data, 1) == '*' &&
+ PyUnicode_READ(kind, data, 2) == '}'))) {
+ /* wildcard: '{}tag' or '{*}tag' */
+ return 1;
+ }
for (i = 0; i < len; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch == '{')
@@ -1162,7 +1169,13 @@ checkpath(PyObject* tag)
}
if (PyBytes_Check(tag)) {
char *p = PyBytes_AS_STRING(tag);
- for (i = 0; i < PyBytes_GET_SIZE(tag); i++) {
+ const Py_ssize_t len = PyBytes_GET_SIZE(tag);
+ if (len >= 3 && p[0] == '{' && (
+ p[1] == '}' || p[1] == '*' && p[2] == '}')) {
+ /* wildcard: '{}tag' or '{*}tag' */
+ return 1;
+ }
+ for (i = 0; i < len; i++) {
if (p[i] == '{')
check = 0;
else if (p[i] == '}')