diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-05-22 14:21:06 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-05-22 14:21:06 (GMT) |
commit | 447b6e3c6ece7db5eb056311b473285366f6cadb (patch) | |
tree | 7b06654559cee8c33ea5bfb4db2bb3a69d1aaefe | |
parent | 43e145b6a47a85ed06e0e719238f3fc0e3fd9eef (diff) | |
parent | 66d53fa9ad846a401292eec622a6a98983bed578 (diff) | |
download | cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.zip cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.tar.gz cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.tar.bz2 |
Issue #16986: ElementTree now correctly parses string (str) input even when
the encoding declared in the XML declaration is not UTF-8 or US-ASCII.
-rw-r--r-- | Include/pyexpat.h | 1 | ||||
-rw-r--r-- | Lib/test/test_xml_etree.py | 45 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Modules/_elementtree.c | 39 | ||||
-rw-r--r-- | Modules/pyexpat.c | 1 |
5 files changed, 65 insertions, 24 deletions
diff --git a/Include/pyexpat.h b/Include/pyexpat.h index 168b5b2..8a79974 100644 --- a/Include/pyexpat.h +++ b/Include/pyexpat.h @@ -45,6 +45,7 @@ struct PyExpat_CAPI void (*SetUserData)(XML_Parser parser, void *userData); void (*SetStartDoctypeDeclHandler)(XML_Parser parser, XML_StartDoctypeDeclHandler start); + enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); /* always add new stuff to the end! */ }; diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 9d61ed7..2ea0058 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -668,15 +668,18 @@ class ElementTreeTest(unittest.TestCase): elem = ET.fromstring("<html><body>text</body></html>") self.assertEqual(ET.tostring(elem), b'<html><body>text</body></html>') - def test_encoding(encoding): - def check(encoding): - ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding) - check("ascii") - check("us-ascii") - check("iso-8859-1") - check("iso-8859-15") - check("cp437") - check("mac-roman") + def test_encoding(self): + def check(encoding, body=''): + xml = ("<?xml version='1.0' encoding='%s'?><xml>%s</xml>" % + (encoding, body)) + self.assertEqual(ET.XML(xml.encode(encoding)).text, body) + self.assertEqual(ET.XML(xml).text, body) + check("ascii", 'a') + check("us-ascii", 'a') + check("iso-8859-1", '\xbd') + check("iso-8859-15", '\u20ac') + check("cp437", '\u221a') + check("mac-roman", '\u02da') def test_methods(self): # Test serialization methods. 
@@ -2002,11 +2005,13 @@ class TreeBuilderTest(unittest.TestCase): class XMLParserTest(unittest.TestCase): - sample1 = '<file><line>22</line></file>' - sample2 = ('<!DOCTYPE html PUBLIC' - ' "-//W3C//DTD XHTML 1.0 Transitional//EN"' - ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">' - '<html>text</html>') + sample1 = b'<file><line>22</line></file>' + sample2 = (b'<!DOCTYPE html PUBLIC' + b' "-//W3C//DTD XHTML 1.0 Transitional//EN"' + b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">' + b'<html>text</html>') + sample3 = ('<?xml version="1.0" encoding="iso-8859-1"?>\n' + '<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>') def _check_sample_element(self, e): self.assertEqual(e.tag, 'file') @@ -2042,12 +2047,21 @@ class XMLParserTest(unittest.TestCase): _doctype = (name, pubid, system) parser = MyParserWithDoctype() - parser.feed(self.sample2) + with self.assertWarns(DeprecationWarning): + parser.feed(self.sample2) parser.close() self.assertEqual(_doctype, ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')) + def test_parse_string(self): + parser = ET.XMLParser(target=ET.TreeBuilder()) + parser.feed(self.sample3) + e = parser.close() + self.assertEqual(e.tag, 'money') + self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b') + self.assertEqual(e.text, '$\xa3\u20ac\U0001017b') + class NamespaceParseTest(unittest.TestCase): def test_find_with_namespace(self): @@ -2473,6 +2487,7 @@ def test_main(module=None): ElementFindTest, ElementIterTest, TreeBuilderTest, + XMLParserTest, BugsTest, ] @@ -96,6 +96,9 @@ Core and Builtins Library ------- +- Issue #16986: ElementTree now correctly parses a string input not only when + an internal XML encoding is UTF-8 or US-ASCII. + - Issue #17996: socket module now exposes AF_LINK constant on BSD and OSX. - Issue #17900: Allowed pickling of recursive OrderedDicts. 
Decreased pickled diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index c53f5ee..0d86886 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -3288,7 +3288,7 @@ xmlparser_dealloc(XMLParserObject* self) } LOCAL(PyObject*) -expat_parse(XMLParserObject* self, char* data, int data_len, int final) +expat_parse(XMLParserObject* self, const char* data, int data_len, int final) { int ok; @@ -3334,16 +3334,37 @@ xmlparser_close(XMLParserObject* self, PyObject* args) } static PyObject* -xmlparser_feed(XMLParserObject* self, PyObject* args) +xmlparser_feed(XMLParserObject* self, PyObject* arg) { /* feed data to parser */ - char* data; - int data_len; - if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len)) - return NULL; - - return expat_parse(self, data, data_len, 0); + if (PyUnicode_Check(arg)) { + Py_ssize_t data_len; + const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len); + if (data == NULL) + return NULL; + if (data_len > INT_MAX) { + PyErr_SetString(PyExc_OverflowError, "size does not fit in an int"); + return NULL; + } + /* Explicitly set UTF-8 encoding. Return code ignored. 
*/ + (void)EXPAT(SetEncoding)(self->parser, "utf-8"); + return expat_parse(self, data, (int)data_len, 0); + } + else { + Py_buffer view; + PyObject *res; + if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0) + return NULL; + if (view.len > INT_MAX) { + PyBuffer_Release(&view); + PyErr_SetString(PyExc_OverflowError, "size does not fit in an int"); + return NULL; + } + res = expat_parse(self, view.buf, (int)view.len, 0); + PyBuffer_Release(&view); + return res; + } } static PyObject* @@ -3523,7 +3544,7 @@ xmlparser_setevents(XMLParserObject *self, PyObject* args) } static PyMethodDef xmlparser_methods[] = { - {"feed", (PyCFunction) xmlparser_feed, METH_VARARGS}, + {"feed", (PyCFunction) xmlparser_feed, METH_O}, {"close", (PyCFunction) xmlparser_close, METH_VARARGS}, {"_parse_whole", (PyCFunction) xmlparser_parse_whole, METH_VARARGS}, {"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS}, diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 022b0cb..4750225 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1937,6 +1937,7 @@ MODULE_INITFUNC(void) capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler; capi.SetUserData = XML_SetUserData; capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler; + capi.SetEncoding = XML_SetEncoding; /* export using capsule */ capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL); |