diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-05-11 06:31:07 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-11 06:31:07 (GMT) |
commit | 707839b0fe02ba2c891a40f40e7a869d84c2c9c5 (patch) | |
tree | 862f7d27088ea23ecaa6da2a53b028b388ea1d9f | |
parent | 75e463430efcb5b20efa93f9a5d98ccd03d83a3d (diff) | |
download | cpython-707839b0fe02ba2c891a40f40e7a869d84c2c9c5.zip cpython-707839b0fe02ba2c891a40f40e7a869d84c2c9c5.tar.gz cpython-707839b0fe02ba2c891a40f40e7a869d84c2c9c5.tar.bz2 |
gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903)
The ElementTree method write() and the function tostring() now use the text
file's encoding ("UTF-8" if not available) instead of the locale encoding in
the XML declaration when encoding="unicode" is specified.
-rw-r--r-- | Lib/test/test_xml_etree.py | 31 | ||||
-rw-r--r-- | Lib/xml/etree/ElementTree.py | 23 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst | 5 |
3 files changed, 29 insertions, 30 deletions
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index db25eab..aea77b1 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -10,7 +10,6 @@ import functools import html import io import itertools -import locale import operator import os import pickle @@ -978,15 +977,13 @@ class ElementTreeTest(unittest.TestCase): def test_tostring_xml_declaration_unicode_encoding(self): elem = ET.XML('<body><tag/></body>') - preferredencoding = locale.getpreferredencoding() self.assertEqual( - f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>", - ET.tostring(elem, encoding='unicode', xml_declaration=True) + ET.tostring(elem, encoding='unicode', xml_declaration=True), + "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>" ) def test_tostring_xml_declaration_cases(self): elem = ET.XML('<body><tag>ø</tag></body>') - preferredencoding = locale.getpreferredencoding() TESTCASES = [ # (expected_retval, encoding, xml_declaration) # ... 
xml_declaration = None @@ -1013,7 +1010,7 @@ class ElementTreeTest(unittest.TestCase): b"<body><tag>ø</tag></body>", 'US-ASCII', True), (b"<?xml version='1.0' encoding='ISO-8859-1'?>\n" b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True), - (f"<?xml version='1.0' encoding='{preferredencoding}'?>\n" + ("<?xml version='1.0' encoding='utf-8'?>\n" "<body><tag>ø</tag></body>", 'unicode', True), ] @@ -1051,11 +1048,10 @@ class ElementTreeTest(unittest.TestCase): b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>" ) - preferredencoding = locale.getpreferredencoding() stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True) self.assertEqual( ''.join(stringlist), - f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>" + "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>" ) self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>") self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:]) @@ -3740,17 +3736,16 @@ class IOTest(unittest.TestCase): encoding = f.encoding os_helper.unlink(TESTFN) - try: - '\xf8'.encode(encoding) - except UnicodeEncodeError: - self.skipTest(f'default file encoding {encoding} not supported') - tree = ET.ElementTree(ET.XML('''<site>\xf8</site>''')) tree.write(TESTFN, encoding='unicode') with open(TESTFN, 'rb') as f: data = f.read() expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace') - self.assertEqual(data, expected) + if encoding.lower() in ('utf-8', 'ascii'): + self.assertEqual(data, expected) + else: + self.assertIn(b"<?xml version='1.0' encoding=", data) + self.assertIn(expected, data) def test_write_to_text_file(self): self.addCleanup(os_helper.unlink, TESTFN) @@ -3765,13 +3760,17 @@ class IOTest(unittest.TestCase): tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''<site>ø</site>''') + self.assertEqual(f.read(), convlinesep( + b'''<?xml version='1.0' 
encoding='ascii'?>\n''' + b'''<site>ø</site>''')) with open(TESTFN, 'w', encoding='ISO-8859-1') as f: tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''<site>\xf8</site>''') + self.assertEqual(f.read(), convlinesep( + b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n''' + b'''<site>\xf8</site>''')) def test_write_to_binary_file(self): self.addCleanup(os_helper.unlink, TESTFN) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 5249c7a..a5cc65e 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -728,16 +728,10 @@ class ElementTree: encoding = "utf-8" else: encoding = "us-ascii" - enc_lower = encoding.lower() - with _get_writer(file_or_filename, enc_lower) as write: + with _get_writer(file_or_filename, encoding) as (write, declared_encoding): if method == "xml" and (xml_declaration or (xml_declaration is None and - enc_lower not in ("utf-8", "us-ascii", "unicode"))): - declared_encoding = encoding - if enc_lower == "unicode": - # Retrieve the default encoding for the xml declaration - import locale - declared_encoding = locale.getpreferredencoding() + declared_encoding.lower() not in ("utf-8", "us-ascii"))): write("<?xml version='1.0' encoding='%s'?>\n" % ( declared_encoding,)) if method == "text": @@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding): write = file_or_filename.write except AttributeError: # file_or_filename is a file name - if encoding == "unicode": - file = open(file_or_filename, "w") + if encoding.lower() == "unicode": + file = open(file_or_filename, "w", + errors="xmlcharrefreplace") else: file = open(file_or_filename, "w", encoding=encoding, errors="xmlcharrefreplace") with file: - yield file.write + yield file.write, file.encoding else: # file_or_filename is a file-like object # encoding determines if it is a text or binary writer - if encoding == "unicode": + if encoding.lower() == "unicode": # use a text 
writer as is - yield write + yield write, getattr(file_or_filename, "encoding", None) or "utf-8" else: # wrap a binary writer with TextIOWrapper with contextlib.ExitStack() as stack: @@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding): # Keep the original file open when the TextIOWrapper is # destroyed stack.callback(file.detach) - yield file.write + yield file.write, encoding def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree diff --git a/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst new file mode 100644 index 0000000..0711f84 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst @@ -0,0 +1,5 @@ +:class:`~xml.etree.ElementTree.ElementTree` method +:meth:`~xml.etree.ElementTree.ElementTree.write` and function +:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding +("UTF-8" if not available) instead of locale encoding in XML declaration +when ``encoding="unicode"`` is specified. |