From 43536e9e373f395a047403831c08acedf3c5f258 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 4 Feb 2013 18:26:15 +0200 Subject: Issue #17089: Expat parser now correctly works with string input not only when an internal XML encoding is UTF-8 or US-ASCII. It now accepts bytes and strings larger than 2 GiB. --- Lib/test/test_pyexpat.py | 79 ++++++++++++++++++++++++++---------------------- Misc/NEWS | 4 +++ Modules/pyexpat.c | 43 +++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 40 deletions(-) diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index 117bda0..7f93bfb 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -52,6 +52,7 @@ data = b'''\ &external_entity; &skipped_entity; +\xb5 ''' @@ -195,13 +196,13 @@ class ParseTest(unittest.TestCase): "End element: 'sub2'", "External entity ref: (None, 'entity.file', None)", ('Skipped entity', ('skipped_entity', 0)), + "Character data: '\xb5'", "End element: 'root'", ] for operation, expected_operation in zip(operations, expected_operations): self.assertEqual(operation, expected_operation) - def test_unicode(self): - # Try the parse again, this time producing Unicode output + def test_parse_bytes(self): out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') self._hookup_callbacks(parser, out) @@ -213,6 +214,16 @@ class ParseTest(unittest.TestCase): # Issue #6697. self.assertRaises(AttributeError, getattr, parser, '\uD800') + def test_parse_str(self): + out = self.Outputter() + parser = expat.ParserCreate(namespace_separator='!') + self._hookup_callbacks(parser, out) + + parser.Parse(data.decode('iso-8859-1'), 1) + + operations = out.out + self._verify_parse_output(operations) + def test_parse_file(self): # Try parsing a file out = self.Outputter() @@ -269,7 +280,7 @@ class InterningTest(unittest.TestCase): L.append(name) p.StartElementHandler = collector p.EndElementHandler = collector - p.Parse(" ", 1) + p.Parse(b" ", 1) tag = L[0] self.assertEqual(len(L), 6) for entry in L: @@ -285,7 +296,7 @@ class InterningTest(unittest.TestCase): def ExternalEntityRefHandler(self, context, base, sysId, pubId): external_parser = self.parser.ExternalEntityParserCreate("") - self.parser_result = external_parser.Parse("", 1) + self.parser_result = external_parser.Parse(b"", 1) return 1 parser = expat.ParserCreate(namespace_separator='!') @@ -336,7 +347,7 @@ class BufferTextTest(unittest.TestCase): def test_buffering_enabled(self): # Make sure buffering is turned on self.assertTrue(self.parser.buffer_text) - self.parser.Parse("123", 1) + self.parser.Parse(b"123", 1) self.assertEqual(self.stuff, ['123'], "buffered text not properly collapsed") @@ -344,39 +355,39 @@ class BufferTextTest(unittest.TestCase): # XXX This test exposes more detail of Expat's text chunking than we # XXX like, but it tests what we need to concisely. self.setHandlers(["StartElementHandler"]) - self.parser.Parse("12\n34\n5", 1) + self.parser.Parse(b"12\n34\n5", 1) self.assertEqual(self.stuff, ["", "1", "", "2", "\n", "3", "", "4\n5"], "buffering control not reacting as expected") def test2(self): - self.parser.Parse("1<2> \n 3", 1) + self.parser.Parse(b"1<2> \n 3", 1) self.assertEqual(self.stuff, ["1<2> \n 3"], "buffered text not properly collapsed") def test3(self): self.setHandlers(["StartElementHandler"]) - self.parser.Parse("123", 1) + self.parser.Parse(b"123", 1) self.assertEqual(self.stuff, ["", "1", "", "2", "", "3"], "buffered text not properly split") def test4(self): self.setHandlers(["StartElementHandler", "EndElementHandler"]) self.parser.CharacterDataHandler = None - self.parser.Parse("123", 1) + self.parser.Parse(b"123", 1) self.assertEqual(self.stuff, ["", "", "", "", "", ""]) def test5(self): self.setHandlers(["StartElementHandler", "EndElementHandler"]) - self.parser.Parse("123", 1) + self.parser.Parse(b"123", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "3", ""]) def test6(self): self.setHandlers(["CommentHandler", "EndElementHandler", "StartElementHandler"]) - self.parser.Parse("12345 ", 1) + self.parser.Parse(b"12345 ", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "345", ""], "buffered text not properly split") @@ -384,7 +395,7 @@ class BufferTextTest(unittest.TestCase): def test7(self): self.setHandlers(["CommentHandler", "EndElementHandler", "StartElementHandler"]) - self.parser.Parse("12345 ", 1) + self.parser.Parse(b"12345 ", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "3", "", "4", "", "5", ""], @@ -400,7 +411,7 @@ class HandlerExceptionTest(unittest.TestCase): parser = expat.ParserCreate() parser.StartElementHandler = self.StartElementHandler try: - parser.Parse("", 1) + parser.Parse(b"", 1) self.fail() except RuntimeError as e: self.assertEqual(e.args[0], 'a', @@ -436,7 +447,7 @@ class PositionTest(unittest.TestCase): self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2), ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)] - xml = '\n \n \n \n' + xml = b'\n \n \n \n' self.parser.Parse(xml, 1) @@ -457,7 +468,7 @@ class sf1296433Test(unittest.TestCase): parser = expat.ParserCreate() parser.CharacterDataHandler = handler - self.assertRaises(Exception, parser.Parse, xml) + self.assertRaises(Exception, parser.Parse, xml.encode('iso8859')) class ChardataBufferTest(unittest.TestCase): """ @@ -480,8 +491,8 @@ class ChardataBufferTest(unittest.TestCase): self.assertRaises(ValueError, f, 0) def test_unchanged_size(self): - xml1 = ("%s" % ('a' * 512)) - xml2 = 'a'*512 + '' + xml1 = b"" + b'a' * 512 + xml2 = b'a'*512 + b'' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_size = 512 @@ -503,9 +514,9 @@ class ChardataBufferTest(unittest.TestCase): def test_disabling_buffer(self): - xml1 = "%s" % ('a' * 512) - xml2 = ('b' * 1024) - xml3 = "%s" % ('c' * 1024) + xml1 = b"" + b'a' * 512 + xml2 = b'b' * 1024 + xml3 = b'c' * 1024 + b''; parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 @@ -532,16 +543,11 @@ class ChardataBufferTest(unittest.TestCase): parser.Parse(xml3, 1) self.assertEqual(self.n, 12) - - - def make_document(self, bytes): - return ("" + bytes * 'a' + '') - def counting_handler(self, text): self.n += 1 def small_buffer_test(self, buffer_len): - xml = "%s" % ('a' * buffer_len) + xml = b"" + b'a' * buffer_len + b'' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_size = 1024 @@ -552,8 +558,8 @@ class ChardataBufferTest(unittest.TestCase): return self.n def test_change_size_1(self): - xml1 = "%s" % ('a' * 1024) - xml2 = "aaa%s" % ('a' * 1025) + xml1 = b"" + b'a' * 1024 + xml2 = b'aaa' + b'a' * 1025 + b'' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 @@ -568,8 +574,8 @@ class ChardataBufferTest(unittest.TestCase): self.assertEqual(self.n, 2) def test_change_size_2(self): - xml1 = "a%s" % ('a' * 1023) - xml2 = "aaa%s" % ('a' * 1025) + xml1 = b"a" + b'a' * 1023 + xml2 = b'aaa' + b'a' * 1025 + b'' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 @@ -585,7 +591,7 @@ class ChardataBufferTest(unittest.TestCase): class MalformedInputTest(unittest.TestCase): def test1(self): - xml = "\0\r\n" + xml = b"\0\r\n" parser = expat.ParserCreate() try: parser.Parse(xml, True) @@ -594,7 +600,8 @@ class MalformedInputTest(unittest.TestCase): self.assertEqual(str(e), 'unclosed token: line 2, column 0') def test2(self): - xml = "\r\n" + # \xc2\x85 is UTF-8 encoded U+0085 (NEXT LINE) + xml = b"\r\n" parser = expat.ParserCreate() try: parser.Parse(xml, True) @@ -609,7 +616,7 @@ class ErrorMessageTest(unittest.TestCase): errors.messages[errors.codes[errors.XML_ERROR_SYNTAX]]) def test_expaterror(self): - xml = '<' + xml = b'<' parser = expat.ParserCreate() try: parser.Parse(xml, True) @@ -638,7 +645,7 @@ class ForeignDTDTests(unittest.TestCase): parser.UseForeignDTD(True) parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity - parser.Parse("") + parser.Parse(b"") self.assertEqual(handler_call_args, [(None, None)]) # test UseForeignDTD() is equal to UseForeignDTD(True) @@ -648,7 +655,7 @@ class ForeignDTDTests(unittest.TestCase): parser.UseForeignDTD() parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity - parser.Parse("") + parser.Parse(b"") self.assertEqual(handler_call_args, [(None, None)]) def test_ignore_use_foreign_dtd(self): @@ -667,7 +674,7 @@ class ForeignDTDTests(unittest.TestCase): parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity parser.Parse( - "") + b"") self.assertEqual(handler_call_args, [("bar", "baz")]) diff --git a/Misc/NEWS b/Misc/NEWS index 7a04599..07a33ce 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -212,6 +212,10 @@ Core and Builtins Library ------- +- Issue #17089: Expat parser now correctly works with string input not only when + an internal XML encoding is UTF-8 or US-ASCII. It now accepts bytes and + strings larger than 2 GiB. + - Issue #16903: Popen.communicate() on Unix now accepts strings when universal_newlines is true as on Windows. diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 4b9687a..9d22d3a 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -777,17 +777,52 @@ PyDoc_STRVAR(xmlparse_Parse__doc__, "Parse(data[, isfinal])\n\ Parse XML data. `isfinal' should be true at end of input."); +#define MAX_CHUNK_SIZE (1 << 20) + static PyObject * xmlparse_Parse(xmlparseobject *self, PyObject *args) { - char *s; - int slen; + PyObject *data; int isFinal = 0; + const char *s; + Py_ssize_t slen; + Py_buffer view; + int rc; - if (!PyArg_ParseTuple(args, "s#|i:Parse", &s, &slen, &isFinal)) + if (!PyArg_ParseTuple(args, "O|i:Parse", &data, &isFinal)) return NULL; - return get_parse_result(self, XML_Parse(self->itself, s, slen, isFinal)); + if (PyUnicode_Check(data)) { + PyObject *bytes; + bytes = PyUnicode_AsUTF8String(data); + if (bytes == NULL) + return NULL; + view.buf = NULL; + s = PyBytes_AS_STRING(bytes); + slen = PyBytes_GET_SIZE(bytes); + /* Explicitly set UTF-8 encoding. Return code ignored. */ + (void)XML_SetEncoding(self->itself, "utf-8"); + } + else { + if (PyObject_GetBuffer(data, &view, PyBUF_SIMPLE) < 0) + return NULL; + s = view.buf; + slen = view.len; + } + + while (slen > MAX_CHUNK_SIZE) { + rc = XML_Parse(self->itself, s, MAX_CHUNK_SIZE, 0); + if (!rc) + goto done; + s += MAX_CHUNK_SIZE; + slen -= MAX_CHUNK_SIZE; + } + rc = XML_Parse(self->itself, s, slen, isFinal); + +done: + if (view.buf != NULL) + PyBuffer_Release(&view); + return get_parse_result(self, rc); } /* File reading copied from cPickle */ -- cgit v0.12