diff options
author | Florent Xicluna <florent.xicluna@gmail.com> | 2010-08-08 19:48:29 (GMT) |
---|---|---|
committer | Florent Xicluna <florent.xicluna@gmail.com> | 2010-08-08 19:48:29 (GMT) |
commit | c17f17294fb31967b52b5b7039d7587e0f82c3bb (patch) | |
tree | 8d7021b079f787e45382391cd3327a0127f2923d | |
parent | 1a0a737b131b5c54ba32a8dc707db39b4ce03674 (diff) | |
download | cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.zip cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.tar.gz cpython-c17f17294fb31967b52b5b7039d7587e0f82c3bb.tar.bz2 |
Issue #8047: Fix the xml.etree serializer to return bytes by default.
Use ``encoding="unicode"`` to generate a Unicode string.
-rw-r--r-- | Doc/library/xml.etree.elementtree.rst | 22 | ||||
-rw-r--r-- | Lib/test/test_xml_etree.py | 46 | ||||
-rw-r--r-- | Lib/xml/etree/ElementTree.py | 59 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
4 files changed, 73 insertions, 57 deletions
diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index 892a59a..32057d4 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -148,20 +148,22 @@ Functions arguments. Returns an element instance. -.. function:: tostring(element, encoding=None, method="xml") +.. function:: tostring(element, encoding="us-ascii", method="xml") Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is - the output encoding (default is None). *method* is either ``"xml"``, + the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to + generate a Unicode string. *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). Returns an (optionally) encoded string containing the XML data. -.. function:: tostringlist(element, encoding=None, method="xml") +.. function:: tostringlist(element, encoding="us-ascii", method="xml") Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is - the output encoding (default is None). *method* is either ``"xml"``, + the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to + generate a Unicode string. *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). Returns a list of (optionally) encoded strings containing the XML data. It does not guarantee any specific sequence, except that ``"".join(tostringlist(element)) == @@ -430,6 +432,7 @@ ElementTree Objects .. method:: getroot() + Returns the root element for this tree. @@ -457,15 +460,16 @@ ElementTree Objects root element. - .. method:: write(file, encoding=None, xml_declaration=None, method="xml") + .. method:: write(file, encoding="us-ascii", xml_declaration=None, method="xml") Writes the element tree to a file, as XML. *file* is a file name, or a file object opened for writing. 
*encoding* [1]_ is the output encoding - (default is None). *xml_declaration* controls if an XML declaration + (default is US-ASCII). Use ``encoding="unicode"`` to write a Unicode string. + *xml_declaration* controls if an XML declaration should be added to the file. Use False for never, True for always, None - for only if not US-ASCII or UTF-8 (default is None). *method* is either - ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). Returns an - (optionally) encoded string. + for only if not US-ASCII or UTF-8 or Unicode (default is None). *method* is + either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). + Returns an (optionally) encoded string. This is the XML file that is going to be manipulated:: diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index e7c8a89..0dc400f 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -71,14 +71,14 @@ def check_method(method): if not hasattr(method, '__call__'): print(method, "not callable") -def serialize(elem, to_string=True, **options): +def serialize(elem, to_string=True, encoding='unicode', **options): import io - if options.get("encoding"): + if encoding != 'unicode': file = io.BytesIO() else: file = io.StringIO() tree = ET.ElementTree(elem) - tree.write(file, **options) + tree.write(file, encoding=encoding, **options) if to_string: return file.getvalue() else: @@ -537,7 +537,7 @@ def attrib(): >>> elem.set('testa', 'testval') >>> elem.set('testb', 'test2') >>> ET.tostring(elem) - '<test testa="testval" testb="test2">aa</test>' + b'<test testa="testval" testb="test2">aa</test>' >>> sorted(elem.keys()) ['testa', 'testb'] >>> sorted(elem.items()) @@ -547,7 +547,7 @@ def attrib(): >>> elem.attrib['testb'] = 'test1' >>> elem.attrib['testc'] = 'test2' >>> ET.tostring(elem) - '<test testa="testval" testb="test1" testc="test2">aa</test>' + b'<test testa="testval" testb="test1" testc="test2">aa</test>' """ def makeelement(): @@ -587,7 +587,7 @@ def parsefile(): 
>>> tree = ET.parse(SIMPLE_XMLFILE) >>> normalize_crlf(tree) - >>> tree.write(sys.stdout) + >>> tree.write(sys.stdout, encoding='unicode') <root> <element key="value">text</element> <element>text</element>tail @@ -595,7 +595,7 @@ def parsefile(): </root> >>> tree = ET.parse(SIMPLE_NS_XMLFILE) >>> normalize_crlf(tree) - >>> tree.write(sys.stdout) + >>> tree.write(sys.stdout, encoding='unicode') <ns0:root xmlns:ns0="namespace"> <ns0:element key="value">text</ns0:element> <ns0:element>text</ns0:element>tail @@ -636,17 +636,17 @@ def parsefile(): def parseliteral(): """ >>> element = ET.XML("<html><body>text</body></html>") - >>> ET.ElementTree(element).write(sys.stdout) + >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode') <html><body>text</body></html> >>> element = ET.fromstring("<html><body>text</body></html>") - >>> ET.ElementTree(element).write(sys.stdout) + >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode') <html><body>text</body></html> >>> sequence = ["<html><body>", "text</bo", "dy></html>"] >>> element = ET.fromstringlist(sequence) >>> print(ET.tostring(element)) - <html><body>text</body></html> - >>> print("".join(ET.tostringlist(element))) - <html><body>text</body></html> + b'<html><body>text</body></html>' + >>> print(b"".join(ET.tostringlist(element))) + b'<html><body>text</body></html>' >>> ET.tostring(element, "ascii") b"<?xml version='1.0' encoding='ascii'?>\\n<html><body>text</body></html>" >>> _, ids = ET.XMLID("<html><body>text</body></html>") @@ -875,10 +875,10 @@ def writestring(): """ >>> elem = ET.XML("<html><body>text</body></html>") >>> ET.tostring(elem) - '<html><body>text</body></html>' + b'<html><body>text</body></html>' >>> elem = ET.fromstring("<html><body>text</body></html>") >>> ET.tostring(elem) - '<html><body>text</body></html>' + b'<html><body>text</body></html>' """ def check_encoding(encoding): @@ -1233,14 +1233,14 @@ def processinginstruction(): Test ProcessingInstruction directly >>> 
ET.tostring(ET.ProcessingInstruction('test', 'instruction')) - '<?test instruction?>' + b'<?test instruction?>' >>> ET.tostring(ET.PI('test', 'instruction')) - '<?test instruction?>' + b'<?test instruction?>' Issue #2746 >>> ET.tostring(ET.PI('test', '<testing&>')) - '<?test <testing&>?>' + b'<?test <testing&>?>' >>> ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin1') b"<?xml version='1.0' encoding='latin1'?>\\n<?test <testing&>\\xe3?>" """ @@ -1643,11 +1643,11 @@ def bug_200708_newline(): >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n") >>> ET.tostring(e) - '<SomeTag text="def _f(): return 3 " />' + b'<SomeTag text="def _f(): return 3 " />' >>> ET.XML(ET.tostring(e)).get("text") 'def _f():\n return 3\n' >>> ET.tostring(ET.XML(ET.tostring(e))) - '<SomeTag text="def _f(): return 3 " />' + b'<SomeTag text="def _f(): return 3 " />' """ @@ -1698,15 +1698,15 @@ def bug_200709_register_namespace(): """ >>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title")) - '<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />' + b'<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />' >>> ET.register_namespace("foo", "http://namespace.invalid/does/not/exist/") >>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title")) - '<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />' + b'<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />' And the Dublin Core namespace is in the default list: >>> ET.tostring(ET.Element("{http://purl.org/dc/elements/1.1/}title")) - '<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />' + b'<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />' """ @@ -1792,7 +1792,7 @@ def check_issue3151(): '{${stuff}}localname' >>> t = ET.ElementTree(e) >>> ET.tostring(e) - '<ns0:localname xmlns:ns0="${stuff}" />' + b'<ns0:localname xmlns:ns0="${stuff}" />' """ diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index c26a764..ecc8ea7 
100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -792,12 +792,13 @@ class ElementTree: # @def write(file, **options) # @param file A file name, or a file object opened for writing. # @param **options Options, given as keyword arguments. - # @keyparam encoding Optional output encoding (default is None). + # @keyparam encoding Optional output encoding (default is US-ASCII). + # Use "unicode" to return a Unicode string. # @keyparam method Optional output method ("xml", "html", "text" or # "c14n"; default is "xml"). # @keyparam xml_declaration Controls if an XML declaration should # be added to the file. Use False for never, True for always, - # None for only if not US-ASCII or UTF-8. None is default. + # None for only if not US-ASCII or UTF-8 or Unicode. None is default. def write(self, file_or_filename, # keyword arguments @@ -811,14 +812,23 @@ class ElementTree: elif method not in _serialize: # FIXME: raise an ImportError for c14n if ElementC14N is missing? raise ValueError("unknown method %r" % method) + if not encoding: + if method == "c14n": + encoding = "utf-8" + else: + encoding = "us-ascii" + elif encoding == str: # lxml.etree compatibility. 
+ encoding = "unicode" + else: + encoding = encoding.lower() if hasattr(file_or_filename, "write"): file = file_or_filename else: - if encoding: + if encoding != "unicode": file = open(file_or_filename, "wb") else: file = open(file_or_filename, "w") - if encoding: + if encoding != "unicode": def write(text): try: return file.write(text.encode(encoding, @@ -827,20 +837,15 @@ class ElementTree: _raise_serialization_error(text) else: write = file.write - if not encoding: - if method == "c14n": - encoding = "utf-8" - else: - encoding = None - elif xml_declaration or (xml_declaration is None and - encoding not in ("utf-8", "us-ascii")): - if method == "xml": - encoding_ = encoding - if not encoding: - # Retrieve the default encoding for the xml declaration - import locale - encoding_ = locale.getpreferredencoding() - write("<?xml version='1.0' encoding='%s'?>\n" % encoding_) + if method == "xml" and (xml_declaration or + (xml_declaration is None and + encoding not in ("utf-8", "us-ascii", "unicode"))): + declared_encoding = encoding + if encoding == "unicode": + # Retrieve the default encoding for the xml declaration + import locale + declared_encoding = locale.getpreferredencoding() + write("<?xml version='1.0' encoding='%s'?>\n" % declared_encoding) if method == "text": _serialize_text(write, self._root) else: @@ -1127,11 +1132,12 @@ def _escape_attrib_html(text): ## # Generates a string representation of an XML element, including all -# subelements. If encoding is None, the return type is a string; +# subelements. If encoding is "unicode", the return type is a string; # otherwise it is a bytes array. # # @param element An Element instance. -# @keyparam encoding Optional output encoding (default is None). +# @keyparam encoding Optional output encoding (default is US-ASCII). +# Use "unicode" to return a Unicode string. # @keyparam method Optional output method ("xml", "html", "text" or # "c14n"; default is "xml"). 
# @return An (optionally) encoded string containing the XML data. @@ -1144,17 +1150,20 @@ def tostring(element, encoding=None, method=None): file = dummy() file.write = data.append ElementTree(element).write(file, encoding, method=method) - if encoding: - return b"".join(data) - else: + if encoding in (str, "unicode"): return "".join(data) + else: + return b"".join(data) ## # Generates a string representation of an XML element, including all -# subelements. The string is returned as a sequence of string fragments. +# subelements. If encoding is "unicode", the string is returned as a +# sequence of string fragments; otherwise it is a sequence of +# bytestrings. # # @param element An Element instance. # @keyparam encoding Optional output encoding (default is US-ASCII). +# Use "unicode" to return a Unicode string. # @keyparam method Optional output method ("xml", "html", "text" or # "c14n"; default is "xml"). # @return A sequence object containing the XML data. @@ -1184,7 +1193,7 @@ def dump(elem): # debugging if not isinstance(elem, ElementTree): elem = ElementTree(elem) - elem.write(sys.stdout) + elem.write(sys.stdout, encoding="unicode") tail = elem.getroot().tail if not tail or tail[-1] != "\n": sys.stdout.write("\n") @@ -55,6 +55,9 @@ Extensions Library ------- +- Issue #8047: Fix the xml.etree serializer to return bytes by default. Use + ``encoding="unicode"`` to generate a Unicode string. + - Fix Issue8280 - urllib2's Request method will remove fragments in the url. This is how it is supposed to work, wget and curl do the same. Previous behavior was wrong. |