summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Include/pyexpat.h 4
-rw-r--r-- Lib/test/test_xml_etree.py 92
-rw-r--r-- Modules/_elementtree.c 43
-rw-r--r-- Modules/pyexpat.c 58
4 files changed, 123 insertions, 74 deletions
diff --git a/Include/pyexpat.h b/Include/pyexpat.h
index 8a79974..44259bf 100644
--- a/Include/pyexpat.h
+++ b/Include/pyexpat.h
@@ -6,7 +6,7 @@
#define PyExpat_CAPI_MAGIC "pyexpat.expat_CAPI 1.0"
#define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
-struct PyExpat_CAPI
+struct PyExpat_CAPI
{
char* magic; /* set to PyExpat_CAPI_MAGIC */
int size; /* set to sizeof(struct PyExpat_CAPI) */
@@ -46,6 +46,8 @@ struct PyExpat_CAPI
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
XML_StartDoctypeDeclHandler start);
enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
+ int (*DefaultUnknownEncodingHandler)(
+ void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
/* always add new stuff to the end! */
};
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 2ea0058..2ec3322 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -681,6 +681,98 @@ class ElementTreeTest(unittest.TestCase):
check("cp437", '\u221a')
check("mac-roman", '\u02da')
+ def xml(encoding):
+ return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+ def bxml(encoding):
+ return xml(encoding).encode(encoding)
+ supported_encodings = [
+ 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+ 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+ 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+ 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+ 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+ 'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+ 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+ 'cp1257', 'cp1258',
+ 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+ 'mac-roman', 'mac-turkish',
+ 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+ 'iso2022-jp-3', 'iso2022-jp-ext',
+ 'koi8-r', 'koi8-u',
+ 'hz', 'ptcp154',
+ ]
+ for encoding in supported_encodings:
+ self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+ unsupported_ascii_compatible_encodings = [
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+ 'gb2312', 'gbk', 'gb18030',
+ 'iso2022-kr', 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'utf-7',
+ ]
+ for encoding in unsupported_ascii_compatible_encodings:
+ self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+ unsupported_ascii_incompatible_encodings = [
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ ]
+ for encoding in unsupported_ascii_incompatible_encodings:
+ self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+ self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+ self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
+ def xml(encoding):
+ return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+ def bxml(encoding):
+ return xml(encoding).encode(encoding)
+ supported_encodings = [
+ 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+ 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+ 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+ 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+ 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+ 'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+ 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+ 'cp1257', 'cp1258',
+ 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+ 'mac-roman', 'mac-turkish',
+ 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+ 'iso2022-jp-3', 'iso2022-jp-ext',
+ 'koi8-r', 'koi8-u',
+ 'hz', 'ptcp154',
+ ]
+ for encoding in supported_encodings:
+ self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+ unsupported_ascii_compatible_encodings = [
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+ 'gb2312', 'gbk', 'gb18030',
+ 'iso2022-kr', 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'utf-7',
+ ]
+ for encoding in unsupported_ascii_compatible_encodings:
+ self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+ unsupported_ascii_incompatible_encodings = [
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ ]
+ for encoding in unsupported_ascii_incompatible_encodings:
+ self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+ self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+ self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
def test_methods(self):
# Test serialization methods.
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index 0d86886..377e88e 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -3094,47 +3094,6 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
}
}
-static int
-expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name,
- XML_Encoding *info)
-{
- PyObject* u;
- unsigned char s[256];
- int i;
- void *data;
- unsigned int kind;
-
- memset(info, 0, sizeof(XML_Encoding));
-
- for (i = 0; i < 256; i++)
- s[i] = i;
-
- u = PyUnicode_Decode((char*) s, 256, name, "replace");
- if (!u)
- return XML_STATUS_ERROR;
- if (PyUnicode_READY(u))
- return XML_STATUS_ERROR;
-
- if (PyUnicode_GET_LENGTH(u) != 256) {
- Py_DECREF(u);
- return XML_STATUS_ERROR;
- }
-
- kind = PyUnicode_KIND(u);
- data = PyUnicode_DATA(u);
- for (i = 0; i < 256; i++) {
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
- if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
- info->map[i] = ch;
- else
- info->map[i] = -1;
- }
-
- Py_DECREF(u);
-
- return XML_STATUS_OK;
-}
-
/* -------------------------------------------------------------------- */
static PyObject *
@@ -3236,7 +3195,7 @@ xmlparser_init(PyObject *self, PyObject *args, PyObject *kwds)
);
EXPAT(SetUnknownEncodingHandler)(
self_xp->parser,
- (XML_UnknownEncodingHandler) expat_unknown_encoding_handler, NULL
+ EXPAT(DefaultUnknownEncodingHandler), NULL
);
return 0;
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 4750225..01ac14e 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1111,53 +1111,59 @@ static struct PyMethodDef xmlparse_methods[] = {
Make it as simple as possible.
*/
-static char template_buffer[257];
-
-static void
-init_template_buffer(void)
-{
- int i;
- for (i = 0; i < 256; i++) {
- template_buffer[i] = i;
- }
- template_buffer[256] = 0;
-}
-
static int
PyUnknownEncodingHandler(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info)
{
- PyUnicodeObject *_u_string = NULL;
- int result = 0;
+ /* Bytes 0x00..0xFF, filled on first use and decoded with the codec
+ named by `name` to build the byte -> code point map in info->map. */
+ static unsigned char template_buffer[256] = {0};
+ PyObject* u;
int i;
- int kind;
void *data;
+ unsigned int kind;
- /* Yes, supports only 8bit encodings */
- _u_string = (PyUnicodeObject *)
- PyUnicode_Decode(template_buffer, 256, name, "replace");
+ /* template_buffer[1] is 0 only until the table has been filled. */
+ if (template_buffer[1] == 0) {
+ for (i = 0; i < 256; i++)
+ template_buffer[i] = i;
+ }
- if (_u_string == NULL || PyUnicode_READY(_u_string) == -1)
- return result;
+ u = PyUnicode_Decode((char*) template_buffer, 256, name, "replace");
+ if (u == NULL || PyUnicode_READY(u)) {
+ /* u is non-NULL when only PyUnicode_READY() failed; release it
+ so the decoded string is not leaked on that error path. */
+ Py_XDECREF(u);
+ return XML_STATUS_ERROR;
+ }
- kind = PyUnicode_KIND(_u_string);
- data = PyUnicode_DATA(_u_string);
+ /* Any length other than 256 means the bytes did not decode
+ one-to-one, which the per-byte map filled below cannot represent. */
+ if (PyUnicode_GET_LENGTH(u) != 256) {
+ Py_DECREF(u);
+ PyErr_SetString(PyExc_ValueError,
+ "multi-byte encodings are not supported");
+ return XML_STATUS_ERROR;
+ }
+ kind = PyUnicode_KIND(u);
+ data = PyUnicode_DATA(u);
for (i = 0; i < 256; i++) {
- /* Stupid to access directly, but fast */
- Py_UCS4 c = PyUnicode_READ(kind, data, i);
- if (c == Py_UNICODE_REPLACEMENT_CHARACTER)
- info->map[i] = -1;
+ /* Bytes the codec replaced with U+FFFD get -1 in the map. */
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
+ info->map[i] = ch;
else
- info->map[i] = c;
+ info->map[i] = -1;
}
+
info->data = NULL;
info->convert = NULL;
info->release = NULL;
- result = 1;
- Py_DECREF(_u_string);
- return result;
+ Py_DECREF(u);
+
+ return XML_STATUS_OK;
}
@@ -1752,7 +1748,6 @@ MODULE_INITFUNC(void)
Py_BuildValue("(iii)", info.major,
info.minor, info.micro));
}
- init_template_buffer();
/* XXX When Expat supports some way of figuring out how it was
compiled, this should check and set native_encoding
appropriately.
@@ -1938,6 +1933,7 @@ MODULE_INITFUNC(void)
capi.SetUserData = XML_SetUserData;
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
capi.SetEncoding = XML_SetEncoding;
+ capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler;
/* export using capsule */
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);