summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2013-05-22 14:21:06 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2013-05-22 14:21:06 (GMT)
commit: 447b6e3c6ece7db5eb056311b473285366f6cadb (patch)
tree: 7b06654559cee8c33ea5bfb4db2bb3a69d1aaefe
parent: 43e145b6a47a85ed06e0e719238f3fc0e3fd9eef (diff)
parent: 66d53fa9ad846a401292eec622a6a98983bed578 (diff)
download: cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.zip
cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.tar.gz
cpython-447b6e3c6ece7db5eb056311b473285366f6cadb.tar.bz2
Issue #16986: ElementTree now correctly parses a string input not only when
an internal XML encoding is UTF-8 or US-ASCII.
-rw-r--r--  Include/pyexpat.h            1
-rw-r--r--  Lib/test/test_xml_etree.py  45
-rw-r--r--  Misc/NEWS                    3
-rw-r--r--  Modules/_elementtree.c      39
-rw-r--r--  Modules/pyexpat.c            1
5 files changed, 65 insertions, 24 deletions
diff --git a/Include/pyexpat.h b/Include/pyexpat.h
index 168b5b2..8a79974 100644
--- a/Include/pyexpat.h
+++ b/Include/pyexpat.h
@@ -45,6 +45,7 @@ struct PyExpat_CAPI
void (*SetUserData)(XML_Parser parser, void *userData);
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
XML_StartDoctypeDeclHandler start);
+ enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
/* always add new stuff to the end! */
};
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 9d61ed7..2ea0058 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -668,15 +668,18 @@ class ElementTreeTest(unittest.TestCase):
elem = ET.fromstring("<html><body>text</body></html>")
self.assertEqual(ET.tostring(elem), b'<html><body>text</body></html>')
- def test_encoding(encoding):
- def check(encoding):
- ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding)
- check("ascii")
- check("us-ascii")
- check("iso-8859-1")
- check("iso-8859-15")
- check("cp437")
- check("mac-roman")
+ def test_encoding(self):
+ def check(encoding, body=''):
+ xml = ("<?xml version='1.0' encoding='%s'?><xml>%s</xml>" %
+ (encoding, body))
+ self.assertEqual(ET.XML(xml.encode(encoding)).text, body)
+ self.assertEqual(ET.XML(xml).text, body)
+ check("ascii", 'a')
+ check("us-ascii", 'a')
+ check("iso-8859-1", '\xbd')
+ check("iso-8859-15", '\u20ac')
+ check("cp437", '\u221a')
+ check("mac-roman", '\u02da')
def test_methods(self):
# Test serialization methods.
@@ -2002,11 +2005,13 @@ class TreeBuilderTest(unittest.TestCase):
class XMLParserTest(unittest.TestCase):
- sample1 = '<file><line>22</line></file>'
- sample2 = ('<!DOCTYPE html PUBLIC'
- ' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
- ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
- '<html>text</html>')
+ sample1 = b'<file><line>22</line></file>'
+ sample2 = (b'<!DOCTYPE html PUBLIC'
+ b' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
+ b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
+ b'<html>text</html>')
+ sample3 = ('<?xml version="1.0" encoding="iso-8859-1"?>\n'
+ '<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>')
def _check_sample_element(self, e):
self.assertEqual(e.tag, 'file')
@@ -2042,12 +2047,21 @@ class XMLParserTest(unittest.TestCase):
_doctype = (name, pubid, system)
parser = MyParserWithDoctype()
- parser.feed(self.sample2)
+ with self.assertWarns(DeprecationWarning):
+ parser.feed(self.sample2)
parser.close()
self.assertEqual(_doctype,
('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'))
+ def test_parse_string(self):
+ parser = ET.XMLParser(target=ET.TreeBuilder())
+ parser.feed(self.sample3)
+ e = parser.close()
+ self.assertEqual(e.tag, 'money')
+ self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b')
+ self.assertEqual(e.text, '$\xa3\u20ac\U0001017b')
+
class NamespaceParseTest(unittest.TestCase):
def test_find_with_namespace(self):
@@ -2473,6 +2487,7 @@ def test_main(module=None):
ElementFindTest,
ElementIterTest,
TreeBuilderTest,
+ XMLParserTest,
BugsTest,
]
diff --git a/Misc/NEWS b/Misc/NEWS
index 8bd5e0b..d56987d 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -96,6 +96,9 @@ Core and Builtins
Library
-------
+- Issue #16986: ElementTree now correctly parses a string input not only when
+ an internal XML encoding is UTF-8 or US-ASCII.
+
- Issue #17996: socket module now exposes AF_LINK constant on BSD and OSX.
- Issue #17900: Allowed pickling of recursive OrderedDicts. Decreased pickled
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index c53f5ee..0d86886 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -3288,7 +3288,7 @@ xmlparser_dealloc(XMLParserObject* self)
}
LOCAL(PyObject*)
-expat_parse(XMLParserObject* self, char* data, int data_len, int final)
+expat_parse(XMLParserObject* self, const char* data, int data_len, int final)
{
int ok;
@@ -3334,16 +3334,37 @@ xmlparser_close(XMLParserObject* self, PyObject* args)
}
static PyObject*
-xmlparser_feed(XMLParserObject* self, PyObject* args)
+xmlparser_feed(XMLParserObject* self, PyObject* arg)
{
/* feed data to parser */
- char* data;
- int data_len;
- if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len))
- return NULL;
-
- return expat_parse(self, data, data_len, 0);
+ if (PyUnicode_Check(arg)) {
+ Py_ssize_t data_len;
+ const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len);
+ if (data == NULL)
+ return NULL;
+ if (data_len > INT_MAX) {
+ PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
+ return NULL;
+ }
+ /* Explicitly set UTF-8 encoding. Return code ignored. */
+ (void)EXPAT(SetEncoding)(self->parser, "utf-8");
+ return expat_parse(self, data, (int)data_len, 0);
+ }
+ else {
+ Py_buffer view;
+ PyObject *res;
+ if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0)
+ return NULL;
+ if (view.len > INT_MAX) {
+ PyBuffer_Release(&view);
+ PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
+ return NULL;
+ }
+ res = expat_parse(self, view.buf, (int)view.len, 0);
+ PyBuffer_Release(&view);
+ return res;
+ }
}
static PyObject*
@@ -3523,7 +3544,7 @@ xmlparser_setevents(XMLParserObject *self, PyObject* args)
}
static PyMethodDef xmlparser_methods[] = {
- {"feed", (PyCFunction) xmlparser_feed, METH_VARARGS},
+ {"feed", (PyCFunction) xmlparser_feed, METH_O},
{"close", (PyCFunction) xmlparser_close, METH_VARARGS},
{"_parse_whole", (PyCFunction) xmlparser_parse_whole, METH_VARARGS},
{"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS},
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 022b0cb..4750225 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1937,6 +1937,7 @@ MODULE_INITFUNC(void)
capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler;
capi.SetUserData = XML_SetUserData;
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
+ capi.SetEncoding = XML_SetEncoding;
/* export using capsule */
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);