summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Florent Xicluna <florent.xicluna@gmail.com> 2010-08-08 19:48:29 (GMT)
committer Florent Xicluna <florent.xicluna@gmail.com> 2010-08-08 19:48:29 (GMT)
commit c17f17294fb31967b52b5b7039d7587e0f82c3bb (patch)
tree 8d7021b079f787e45382391cd3327a0127f2923d
parent 1a0a737b131b5c54ba32a8dc707db39b4ce03674 (diff)
download cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.zip
cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.tar.gz
cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.tar.bz2
Issue #8047: Fix the xml.etree serializer to return bytes by default.
Use ``encoding="unicode"`` to generate a Unicode string.
-rw-r--r-- Doc/library/xml.etree.elementtree.rst 22
-rw-r--r-- Lib/test/test_xml_etree.py 46
-rw-r--r-- Lib/xml/etree/ElementTree.py 59
-rw-r--r-- Misc/NEWS 3
4 files changed, 73 insertions, 57 deletions
diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst
index 892a59a..32057d4 100644
--- a/Doc/library/xml.etree.elementtree.rst
+++ b/Doc/library/xml.etree.elementtree.rst
@@ -148,20 +148,22 @@ Functions
arguments. Returns an element instance.
-.. function:: tostring(element, encoding=None, method="xml")
+.. function:: tostring(element, encoding="us-ascii", method="xml")
Generates a string representation of an XML element, including all
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
- the output encoding (default is None). *method* is either ``"xml"``,
+ the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
+ generate a Unicode string. *method* is either ``"xml"``,
``"html"`` or ``"text"`` (default is ``"xml"``). Returns an (optionally)
encoded string containing the XML data.
-.. function:: tostringlist(element, encoding=None, method="xml")
+.. function:: tostringlist(element, encoding="us-ascii", method="xml")
Generates a string representation of an XML element, including all
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
- the output encoding (default is None). *method* is either ``"xml"``,
+ the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
+ generate a Unicode string. *method* is either ``"xml"``,
``"html"`` or ``"text"`` (default is ``"xml"``). Returns a list of
(optionally) encoded strings containing the XML data. It does not guarantee
any specific sequence, except that ``"".join(tostringlist(element)) ==
@@ -430,6 +432,7 @@ ElementTree Objects
.. method:: getroot()
+
Returns the root element for this tree.
@@ -457,15 +460,16 @@ ElementTree Objects
root element.
- .. method:: write(file, encoding=None, xml_declaration=None, method="xml")
+ .. method:: write(file, encoding="us-ascii", xml_declaration=None, method="xml")
Writes the element tree to a file, as XML. *file* is a file name, or a
file object opened for writing. *encoding* [1]_ is the output encoding
- (default is None). *xml_declaration* controls if an XML declaration
+ (default is US-ASCII). Use ``encoding="unicode"`` to write a Unicode string.
+ *xml_declaration* controls if an XML declaration
should be added to the file. Use False for never, True for always, None
- for only if not US-ASCII or UTF-8 (default is None). *method* is either
- ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). Returns an
- (optionally) encoded string.
+ for only if not US-ASCII or UTF-8 or Unicode (default is None). *method* is
+ either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
+ Returns an (optionally) encoded string.
This is the XML file that is going to be manipulated::
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index e7c8a89..0dc400f 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -71,14 +71,14 @@ def check_method(method):
if not hasattr(method, '__call__'):
print(method, "not callable")
-def serialize(elem, to_string=True, **options):
+def serialize(elem, to_string=True, encoding='unicode', **options):
import io
- if options.get("encoding"):
+ if encoding != 'unicode':
file = io.BytesIO()
else:
file = io.StringIO()
tree = ET.ElementTree(elem)
- tree.write(file, **options)
+ tree.write(file, encoding=encoding, **options)
if to_string:
return file.getvalue()
else:
@@ -537,7 +537,7 @@ def attrib():
>>> elem.set('testa', 'testval')
>>> elem.set('testb', 'test2')
>>> ET.tostring(elem)
- '<test testa="testval" testb="test2">aa</test>'
+ b'<test testa="testval" testb="test2">aa</test>'
>>> sorted(elem.keys())
['testa', 'testb']
>>> sorted(elem.items())
@@ -547,7 +547,7 @@ def attrib():
>>> elem.attrib['testb'] = 'test1'
>>> elem.attrib['testc'] = 'test2'
>>> ET.tostring(elem)
- '<test testa="testval" testb="test1" testc="test2">aa</test>'
+ b'<test testa="testval" testb="test1" testc="test2">aa</test>'
"""
def makeelement():
@@ -587,7 +587,7 @@ def parsefile():
>>> tree = ET.parse(SIMPLE_XMLFILE)
>>> normalize_crlf(tree)
- >>> tree.write(sys.stdout)
+ >>> tree.write(sys.stdout, encoding='unicode')
<root>
<element key="value">text</element>
<element>text</element>tail
@@ -595,7 +595,7 @@ def parsefile():
</root>
>>> tree = ET.parse(SIMPLE_NS_XMLFILE)
>>> normalize_crlf(tree)
- >>> tree.write(sys.stdout)
+ >>> tree.write(sys.stdout, encoding='unicode')
<ns0:root xmlns:ns0="namespace">
<ns0:element key="value">text</ns0:element>
<ns0:element>text</ns0:element>tail
@@ -636,17 +636,17 @@ def parsefile():
def parseliteral():
"""
>>> element = ET.XML("<html><body>text</body></html>")
- >>> ET.ElementTree(element).write(sys.stdout)
+ >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
<html><body>text</body></html>
>>> element = ET.fromstring("<html><body>text</body></html>")
- >>> ET.ElementTree(element).write(sys.stdout)
+ >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
<html><body>text</body></html>
>>> sequence = ["<html><body>", "text</bo", "dy></html>"]
>>> element = ET.fromstringlist(sequence)
>>> print(ET.tostring(element))
- <html><body>text</body></html>
- >>> print("".join(ET.tostringlist(element)))
- <html><body>text</body></html>
+ b'<html><body>text</body></html>'
+ >>> print(b"".join(ET.tostringlist(element)))
+ b'<html><body>text</body></html>'
>>> ET.tostring(element, "ascii")
b"<?xml version='1.0' encoding='ascii'?>\\n<html><body>text</body></html>"
>>> _, ids = ET.XMLID("<html><body>text</body></html>")
@@ -875,10 +875,10 @@ def writestring():
"""
>>> elem = ET.XML("<html><body>text</body></html>")
>>> ET.tostring(elem)
- '<html><body>text</body></html>'
+ b'<html><body>text</body></html>'
>>> elem = ET.fromstring("<html><body>text</body></html>")
>>> ET.tostring(elem)
- '<html><body>text</body></html>'
+ b'<html><body>text</body></html>'
"""
def check_encoding(encoding):
@@ -1233,14 +1233,14 @@ def processinginstruction():
Test ProcessingInstruction directly
>>> ET.tostring(ET.ProcessingInstruction('test', 'instruction'))
- '<?test instruction?>'
+ b'<?test instruction?>'
>>> ET.tostring(ET.PI('test', 'instruction'))
- '<?test instruction?>'
+ b'<?test instruction?>'
Issue #2746
>>> ET.tostring(ET.PI('test', '<testing&>'))
- '<?test <testing&>?>'
+ b'<?test <testing&>?>'
>>> ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin1')
b"<?xml version='1.0' encoding='latin1'?>\\n<?test <testing&>\\xe3?>"
"""
@@ -1643,11 +1643,11 @@ def bug_200708_newline():
>>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
>>> ET.tostring(e)
- '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ b'<SomeTag text="def _f():&#10; return 3&#10;" />'
>>> ET.XML(ET.tostring(e)).get("text")
'def _f():\n return 3\n'
>>> ET.tostring(ET.XML(ET.tostring(e)))
- '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ b'<SomeTag text="def _f():&#10; return 3&#10;" />'
"""
@@ -1698,15 +1698,15 @@ def bug_200709_register_namespace():
"""
>>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
- '<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
+ b'<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
>>> ET.register_namespace("foo", "http://namespace.invalid/does/not/exist/")
>>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
- '<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
+ b'<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
And the Dublin Core namespace is in the default list:
>>> ET.tostring(ET.Element("{http://purl.org/dc/elements/1.1/}title"))
- '<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
+ b'<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
"""
@@ -1792,7 +1792,7 @@ def check_issue3151():
'{${stuff}}localname'
>>> t = ET.ElementTree(e)
>>> ET.tostring(e)
- '<ns0:localname xmlns:ns0="${stuff}" />'
+ b'<ns0:localname xmlns:ns0="${stuff}" />'
"""
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index c26a764..ecc8ea7 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -792,12 +792,13 @@ class ElementTree:
# @def write(file, **options)
# @param file A file name, or a file object opened for writing.
# @param **options Options, given as keyword arguments.
- # @keyparam encoding Optional output encoding (default is None).
+ # @keyparam encoding Optional output encoding (default is US-ASCII).
+ # Use "unicode" to return a Unicode string.
# @keyparam method Optional output method ("xml", "html", "text" or
# "c14n"; default is "xml").
# @keyparam xml_declaration Controls if an XML declaration should
# be added to the file. Use False for never, True for always,
- # None for only if not US-ASCII or UTF-8. None is default.
+ # None for only if not US-ASCII or UTF-8 or Unicode. None is default.
def write(self, file_or_filename,
# keyword arguments
@@ -811,14 +812,23 @@ class ElementTree:
elif method not in _serialize:
# FIXME: raise an ImportError for c14n if ElementC14N is missing?
raise ValueError("unknown method %r" % method)
+ if not encoding:
+ if method == "c14n":
+ encoding = "utf-8"
+ else:
+ encoding = "us-ascii"
+ elif encoding == str: # lxml.etree compatibility.
+ encoding = "unicode"
+ else:
+ encoding = encoding.lower()
if hasattr(file_or_filename, "write"):
file = file_or_filename
else:
- if encoding:
+ if encoding != "unicode":
file = open(file_or_filename, "wb")
else:
file = open(file_or_filename, "w")
- if encoding:
+ if encoding != "unicode":
def write(text):
try:
return file.write(text.encode(encoding,
@@ -827,20 +837,15 @@ class ElementTree:
_raise_serialization_error(text)
else:
write = file.write
- if not encoding:
- if method == "c14n":
- encoding = "utf-8"
- else:
- encoding = None
- elif xml_declaration or (xml_declaration is None and
- encoding not in ("utf-8", "us-ascii")):
- if method == "xml":
- encoding_ = encoding
- if not encoding:
- # Retrieve the default encoding for the xml declaration
- import locale
- encoding_ = locale.getpreferredencoding()
- write("<?xml version='1.0' encoding='%s'?>\n" % encoding_)
+ if method == "xml" and (xml_declaration or
+ (xml_declaration is None and
+ encoding not in ("utf-8", "us-ascii", "unicode"))):
+ declared_encoding = encoding
+ if encoding == "unicode":
+ # Retrieve the default encoding for the xml declaration
+ import locale
+ declared_encoding = locale.getpreferredencoding()
+ write("<?xml version='1.0' encoding='%s'?>\n" % declared_encoding)
if method == "text":
_serialize_text(write, self._root)
else:
@@ -1127,11 +1132,12 @@ def _escape_attrib_html(text):
##
# Generates a string representation of an XML element, including all
-# subelements. If encoding is None, the return type is a string;
+# subelements. If encoding is "unicode", the return type is a string;
# otherwise it is a bytes array.
#
# @param element An Element instance.
-# @keyparam encoding Optional output encoding (default is None).
+# @keyparam encoding Optional output encoding (default is US-ASCII).
+# Use "unicode" to return a Unicode string.
# @keyparam method Optional output method ("xml", "html", "text" or
# "c14n"; default is "xml").
# @return An (optionally) encoded string containing the XML data.
@@ -1144,17 +1150,20 @@ def tostring(element, encoding=None, method=None):
file = dummy()
file.write = data.append
ElementTree(element).write(file, encoding, method=method)
- if encoding:
- return b"".join(data)
- else:
+ if encoding in (str, "unicode"):
return "".join(data)
+ else:
+ return b"".join(data)
##
# Generates a string representation of an XML element, including all
-# subelements. The string is returned as a sequence of string fragments.
+# subelements. If encoding is "unicode", the string is returned as a
+# sequence of string fragments; otherwise it is a sequence of
+# bytestrings.
#
# @param element An Element instance.
# @keyparam encoding Optional output encoding (default is US-ASCII).
+# Use "unicode" to return a Unicode string.
# @keyparam method Optional output method ("xml", "html", "text" or
# "c14n"; default is "xml").
# @return A sequence object containing the XML data.
@@ -1184,7 +1193,7 @@ def dump(elem):
# debugging
if not isinstance(elem, ElementTree):
elem = ElementTree(elem)
- elem.write(sys.stdout)
+ elem.write(sys.stdout, encoding="unicode")
tail = elem.getroot().tail
if not tail or tail[-1] != "\n":
sys.stdout.write("\n")
diff --git a/Misc/NEWS b/Misc/NEWS
index 644bd9e..294382c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -55,6 +55,9 @@ Extensions
Library
-------
+- Issue #8047: Fix the xml.etree serializer to return bytes by default. Use
+ ``encoding="unicode"`` to generate a Unicode string.
+
- Fix Issue8280 - urllib2's Request method will remove fragments in the url.
This is how it is supposed to work, wget and curl do the same. Previous
behavior was wrong.