diff options
author | Eli Bendersky <eliben@gmail.com> | 2012-07-15 03:02:22 (GMT) |
---|---|---|
committer | Eli Bendersky <eliben@gmail.com> | 2012-07-15 03:02:22 (GMT) |
commit | 00f402bfcbe3245f9c62f86376fc77bb9e7de639 (patch) | |
tree | c5035e1c4af4be283479aca143ba687d74d19c0f /Lib/test/test_xml_etree.py | |
parent | 1191709b1379661a15287a2c6ac8263f23655f73 (diff) | |
download | cpython-00f402bfcbe3245f9c62f86376fc77bb9e7de639.zip cpython-00f402bfcbe3245f9c62f86376fc77bb9e7de639.tar.gz cpython-00f402bfcbe3245f9c62f86376fc77bb9e7de639.tar.bz2 |
Close #1767933: Badly formed XML using etree and utf-16. Patch by Serhiy Storchaka, with some minor fixes by me.
Diffstat (limited to 'Lib/test/test_xml_etree.py')
-rw-r--r-- | Lib/test/test_xml_etree.py | 240 |
1 file changed, 175 insertions, 65 deletions
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index c1fc955..d90f978 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -21,7 +21,7 @@ import unittest import weakref from test import support -from test.support import findfile, import_fresh_module, gc_collect +from test.support import TESTFN, findfile, unlink, import_fresh_module, gc_collect pyET = None ET = None @@ -888,65 +888,6 @@ def check_encoding(encoding): """ ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding) -def encoding(): - r""" - Test encoding issues. - - >>> elem = ET.Element("tag") - >>> elem.text = "abc" - >>> serialize(elem) - '<tag>abc</tag>' - >>> serialize(elem, encoding="utf-8") - b'<tag>abc</tag>' - >>> serialize(elem, encoding="us-ascii") - b'<tag>abc</tag>' - >>> serialize(elem, encoding="iso-8859-1") - b"<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>abc</tag>" - - >>> elem.text = "<&\"\'>" - >>> serialize(elem) - '<tag><&"\'></tag>' - >>> serialize(elem, encoding="utf-8") - b'<tag><&"\'></tag>' - >>> serialize(elem, encoding="us-ascii") # cdata characters - b'<tag><&"\'></tag>' - >>> serialize(elem, encoding="iso-8859-1") - b'<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag><&"\'></tag>' - - >>> elem.attrib["key"] = "<&\"\'>" - >>> elem.text = None - >>> serialize(elem) - '<tag key="<&"\'>" />' - >>> serialize(elem, encoding="utf-8") - b'<tag key="<&"\'>" />' - >>> serialize(elem, encoding="us-ascii") - b'<tag key="<&"\'>" />' - >>> serialize(elem, encoding="iso-8859-1") - b'<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="<&"\'>" />' - - >>> elem.text = '\xe5\xf6\xf6<>' - >>> elem.attrib.clear() - >>> serialize(elem) - '<tag>\xe5\xf6\xf6<></tag>' - >>> serialize(elem, encoding="utf-8") - b'<tag>\xc3\xa5\xc3\xb6\xc3\xb6<></tag>' - >>> serialize(elem, encoding="us-ascii") - b'<tag>åöö<></tag>' - >>> serialize(elem, encoding="iso-8859-1") - b"<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>\xe5\xf6\xf6<></tag>" - 
- >>> elem.attrib["key"] = '\xe5\xf6\xf6<>' - >>> elem.text = None - >>> serialize(elem) - '<tag key="\xe5\xf6\xf6<>" />' - >>> serialize(elem, encoding="utf-8") - b'<tag key="\xc3\xa5\xc3\xb6\xc3\xb6<>" />' - >>> serialize(elem, encoding="us-ascii") - b'<tag key="åöö<>" />' - >>> serialize(elem, encoding="iso-8859-1") - b'<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="\xe5\xf6\xf6<>" />' - """ - def methods(): r""" Test serialization methods. @@ -2166,16 +2107,185 @@ class ElementSlicingTest(unittest.TestCase): self.assertEqual(self._subelem_tags(e), ['a1']) -class StringIOTest(unittest.TestCase): +class IOTest(unittest.TestCase): + def tearDown(self): + unlink(TESTFN) + + def test_encoding(self): + # Test encoding issues. + elem = ET.Element("tag") + elem.text = "abc" + self.assertEqual(serialize(elem), '<tag>abc</tag>') + self.assertEqual(serialize(elem, encoding="utf-8"), + b'<tag>abc</tag>') + self.assertEqual(serialize(elem, encoding="us-ascii"), + b'<tag>abc</tag>') + for enc in ("iso-8859-1", "utf-16", "utf-32"): + self.assertEqual(serialize(elem, encoding=enc), + ("<?xml version='1.0' encoding='%s'?>\n" + "<tag>abc</tag>" % enc).encode(enc)) + + elem = ET.Element("tag") + elem.text = "<&\"\'>" + self.assertEqual(serialize(elem), '<tag><&"\'></tag>') + self.assertEqual(serialize(elem, encoding="utf-8"), + b'<tag><&"\'></tag>') + self.assertEqual(serialize(elem, encoding="us-ascii"), + b'<tag><&"\'></tag>') + for enc in ("iso-8859-1", "utf-16", "utf-32"): + self.assertEqual(serialize(elem, encoding=enc), + ("<?xml version='1.0' encoding='%s'?>\n" + "<tag><&\"'></tag>" % enc).encode(enc)) + + elem = ET.Element("tag") + elem.attrib["key"] = "<&\"\'>" + self.assertEqual(serialize(elem), '<tag key="<&"\'>" />') + self.assertEqual(serialize(elem, encoding="utf-8"), + b'<tag key="<&"\'>" />') + self.assertEqual(serialize(elem, encoding="us-ascii"), + b'<tag key="<&"\'>" />') + for enc in ("iso-8859-1", "utf-16", "utf-32"): + 
self.assertEqual(serialize(elem, encoding=enc), + ("<?xml version='1.0' encoding='%s'?>\n" + "<tag key=\"<&"'>\" />" % enc).encode(enc)) + + elem = ET.Element("tag") + elem.text = '\xe5\xf6\xf6<>' + self.assertEqual(serialize(elem), '<tag>\xe5\xf6\xf6<></tag>') + self.assertEqual(serialize(elem, encoding="utf-8"), + b'<tag>\xc3\xa5\xc3\xb6\xc3\xb6<></tag>') + self.assertEqual(serialize(elem, encoding="us-ascii"), + b'<tag>åöö<></tag>') + for enc in ("iso-8859-1", "utf-16", "utf-32"): + self.assertEqual(serialize(elem, encoding=enc), + ("<?xml version='1.0' encoding='%s'?>\n" + "<tag>åöö<></tag>" % enc).encode(enc)) + + elem = ET.Element("tag") + elem.attrib["key"] = '\xe5\xf6\xf6<>' + self.assertEqual(serialize(elem), '<tag key="\xe5\xf6\xf6<>" />') + self.assertEqual(serialize(elem, encoding="utf-8"), + b'<tag key="\xc3\xa5\xc3\xb6\xc3\xb6<>" />') + self.assertEqual(serialize(elem, encoding="us-ascii"), + b'<tag key="åöö<>" />') + for enc in ("iso-8859-1", "utf-16", "utf-16le", "utf-16be", "utf-32"): + self.assertEqual(serialize(elem, encoding=enc), + ("<?xml version='1.0' encoding='%s'?>\n" + "<tag key=\"åöö<>\" />" % enc).encode(enc)) + + def test_write_to_filename(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + tree.write(TESTFN) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), b'''<site />''') + + def test_write_to_text_file(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + with open(TESTFN, 'w', encoding='utf-8') as f: + tree.write(f, encoding='unicode') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), b'''<site />''') + + def test_write_to_binary_file(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + with open(TESTFN, 'wb') as f: + tree.write(f) + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), b'''<site />''') + + def test_write_to_binary_file_with_bom(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + # test BOM writing to buffered file 
+ with open(TESTFN, 'wb') as f: + tree.write(f, encoding='utf-16') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + '''<?xml version='1.0' encoding='utf-16'?>\n''' + '''<site />'''.encode("utf-16")) + # test BOM writing to non-buffered file + with open(TESTFN, 'wb', buffering=0) as f: + tree.write(f, encoding='utf-16') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + '''<?xml version='1.0' encoding='utf-16'?>\n''' + '''<site />'''.encode("utf-16")) + def test_read_from_stringio(self): tree = ET.ElementTree() - stream = io.StringIO() - stream.write('''<?xml version="1.0"?><site></site>''') - stream.seek(0) + stream = io.StringIO('''<?xml version="1.0"?><site></site>''') tree.parse(stream) + self.assertEqual(tree.getroot().tag, 'site') + def test_write_to_stringio(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + stream = io.StringIO() + tree.write(stream, encoding='unicode') + self.assertEqual(stream.getvalue(), '''<site />''') + + def test_read_from_bytesio(self): + tree = ET.ElementTree() + raw = io.BytesIO(b'''<?xml version="1.0"?><site></site>''') + tree.parse(raw) + self.assertEqual(tree.getroot().tag, 'site') + + def test_write_to_bytesio(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + raw = io.BytesIO() + tree.write(raw) + self.assertEqual(raw.getvalue(), b'''<site />''') + + class dummy: + pass + + def test_read_from_user_text_reader(self): + stream = io.StringIO('''<?xml version="1.0"?><site></site>''') + reader = self.dummy() + reader.read = stream.read + tree = ET.ElementTree() + tree.parse(reader) self.assertEqual(tree.getroot().tag, 'site') + def test_write_to_user_text_writer(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + stream = io.StringIO() + writer = self.dummy() + writer.write = stream.write + tree.write(writer, encoding='unicode') + self.assertEqual(stream.getvalue(), '''<site />''') + + def test_read_from_user_binary_reader(self): + 
raw = io.BytesIO(b'''<?xml version="1.0"?><site></site>''') + reader = self.dummy() + reader.read = raw.read + tree = ET.ElementTree() + tree.parse(reader) + self.assertEqual(tree.getroot().tag, 'site') + tree = ET.ElementTree() + + def test_write_to_user_binary_writer(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + raw = io.BytesIO() + writer = self.dummy() + writer.write = raw.write + tree.write(writer) + self.assertEqual(raw.getvalue(), b'''<site />''') + + def test_write_to_user_binary_writer_with_bom(self): + tree = ET.ElementTree(ET.XML('''<site />''')) + raw = io.BytesIO() + writer = self.dummy() + writer.write = raw.write + writer.seekable = lambda: True + writer.tell = raw.tell + tree.write(writer, encoding="utf-16") + self.assertEqual(raw.getvalue(), + '''<?xml version='1.0' encoding='utf-16'?>\n''' + '''<site />'''.encode("utf-16")) + class ParseErrorTest(unittest.TestCase): def test_subclass(self): @@ -2299,7 +2409,7 @@ def test_main(module=None): test_classes = [ ElementSlicingTest, BasicElementTest, - StringIOTest, + IOTest, ParseErrorTest, XincludeTest, ElementTreeTest, |