diff options
-rw-r--r-- | Doc/lib/libpyexpat.tex | 17 | ||||
-rw-r--r-- | Lib/test/test_pyexpat.py | 65 | ||||
-rw-r--r-- | Modules/pyexpat.c | 94 |
3 files changed, 28 insertions, 148 deletions
diff --git a/Doc/lib/libpyexpat.tex b/Doc/lib/libpyexpat.tex index a0ea8a1..ed0bf6a 100644 --- a/Doc/lib/libpyexpat.tex +++ b/Doc/lib/libpyexpat.tex @@ -151,8 +151,8 @@ Create a ``child'' parser which can be used to parse an external parsed entity referred to by content parsed by the parent parser. The \var{context} parameter should be the string passed to the \method{ExternalEntityRefHandler()} handler function, described below. -The child parser is created with the \member{ordered_attributes}, -\member{returns_unicode} and \member{specified_attributes} set to the +The child parser is created with the \member{ordered_attributes} +and \member{specified_attributes} set to the values of this parser. \end{methoddesc} @@ -214,16 +214,6 @@ any time. \versionadded{2.1} \end{memberdesc} -\begin{memberdesc}[xmlparser]{returns_unicode} -If this attribute is set to a non-zero integer, the handler functions -will be passed Unicode strings. If \member{returns_unicode} is -\constant{False}, 8-bit strings containing UTF-8 encoded data will be -passed to the handlers. This is \constant{True} by default when -Python is built with Unicode support. -\versionchanged[Can be changed at any time to affect the result - type]{1.6} -\end{memberdesc} - \begin{memberdesc}[xmlparser]{specified_attributes} If set to a non-zero integer, the parser will report only those attributes which were specified in the document instance and not those @@ -290,8 +280,7 @@ Called when the XML declaration is parsed. The XML declaration is the (optional) declaration of the applicable version of the XML recommendation, the encoding of the document text, and an optional ``standalone'' declaration. \var{version} and \var{encoding} will be -strings of the type dictated by the \member{returns_unicode} -attribute, and \var{standalone} will be \code{1} if the document is +strings, and \var{standalone} will be \code{1} if the document is declared standalone, \code{0} if it is declared not to be standalone, or \code{-1} if the standalone clause was omitted. This is only available with Expat version 1.95.0 or newer. diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index 62c5d02..cb4e6eb 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -1,7 +1,7 @@ # XXX TypeErrors on calling handlers, or on bad return values from a # handler, are obscure and unhelpful. -import StringIO +from io import BytesIO import unittest import pyexpat @@ -20,11 +20,6 @@ class SetAttributeTest(unittest.TestCase): [0, 0], ] - def test_returns_unicode(self): - for x, y in self.set_get_pairs: - self.parser.returns_unicode = x - self.assertEquals(self.parser.returns_unicode, y) - def test_ordered_attributes(self): for x, y in self.set_get_pairs: self.parser.ordered_attributes = x @@ -36,7 +31,7 @@ class SetAttributeTest(unittest.TestCase): self.assertEquals(self.parser.specified_attributes, y) -data = '''\ +data = b'''\ <?xml version="1.0" encoding="iso-8859-1" standalone="no"?> <?xml-stylesheet href="stylesheet.css"?> <!-- comment data --> @@ -130,22 +125,12 @@ class ParseTest(unittest.TestCase): 'ExternalEntityRefHandler' ] - def test_utf8(self): - - out = self.Outputter() - parser = expat.ParserCreate(namespace_separator='!') - for name in self.handler_names: - setattr(parser, name, getattr(out, name)) - parser.returns_unicode = 0 - parser.Parse(data, 1) - - # Verify output - op = out.out + def _verify_parse_output(self, op): self.assertEquals(op[0], 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'') self.assertEquals(op[1], "Comment: ' comment data '") self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)") self.assertEquals(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')") - self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}") + self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\u1f40'}") self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'") self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}") self.assertEquals(op[7], "Character data: 'Contents of subelements'") @@ -159,65 +144,31 @@ class ParseTest(unittest.TestCase): self.assertEquals(op[15], "External entity ref: (None, 'entity.file', None)") self.assertEquals(op[16], "End element: 'root'") + def test_unicode(self): # Try the parse again, this time producing Unicode output out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') - parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) parser.Parse(data, 1) op = out.out - self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') - self.assertEquals(op[1], "Comment: u' comment data '") - self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") - self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") - self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") - self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") - self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") - self.assertEquals(op[7], "Character data: u'Contents of subelements'") - self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'") - self.assertEquals(op[9], "End of NS decl: u'myns'") - self.assertEquals(op[10], "Start element: u'sub2' {}") - self.assertEquals(op[11], 'Start of CDATA section') - self.assertEquals(op[12], "Character data: u'contents of CDATA section'") - self.assertEquals(op[13], 'End of CDATA section') - self.assertEquals(op[14], "End element: u'sub2'") - self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)") - self.assertEquals(op[16], "End element: u'root'") + self._verify_parse_output(op) def test_parse_file(self): # Try parsing a file out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') - parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) - file = StringIO.StringIO(data) + file = BytesIO(data) parser.ParseFile(file) op = out.out - self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') - self.assertEquals(op[1], "Comment: u' comment data '") - self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") - self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") - self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") - self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") - self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") - self.assertEquals(op[7], "Character data: u'Contents of subelements'") - self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'") - self.assertEquals(op[9], "End of NS decl: u'myns'") - self.assertEquals(op[10], "Start element: u'sub2' {}") - self.assertEquals(op[11], 'Start of CDATA section') - self.assertEquals(op[12], "Character data: u'contents of CDATA section'") - self.assertEquals(op[13], 'End of CDATA section') - self.assertEquals(op[14], "End element: u'sub2'") - self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)") - self.assertEquals(op[16], "End element: u'root'") - + self._verify_parse_output(op) class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 13c1d27..8638b2e 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -62,8 +62,6 @@ typedef struct { PyObject_HEAD XML_Parser itself; - int returns_unicode; /* True if Unicode strings are returned; - if false, UTF-8 strings are returned */ int ordered_attributes; /* Return attributes as a list. */ int specified_attributes; /* Report only specified attributes. */ int in_callback; /* Is a callback active? */ @@ -185,35 +183,6 @@ conv_string_len_to_unicode(const XML_Char *str, int len) return PyUnicode_DecodeUTF8((const char *)str, len, "strict"); } -/* Convert a string of XML_Chars into an 8-bit Python string. - Returns None if str is a null pointer. */ - -static PyObject * -conv_string_to_utf8(const XML_Char *str) -{ - /* XXX currently this code assumes that XML_Char is 8-bit, - and hence in UTF-8. */ - /* UTF-8 from Expat, UTF-8 desired */ - if (str == NULL) { - Py_INCREF(Py_None); - return Py_None; - } - return PyString_FromString(str); -} - -static PyObject * -conv_string_len_to_utf8(const XML_Char *str, int len) -{ - /* XXX currently this code assumes that XML_Char is 8-bit, - and hence in UTF-8. */ - /* UTF-8 from Expat, UTF-8 desired */ - if (str == NULL) { - Py_INCREF(Py_None); - return Py_None; - } - return PyString_FromStringAndSize((const char *)str, len); -} - /* Callback routines */ static void clear_handlers(xmlparseobject *self, int initial); @@ -411,14 +380,10 @@ call_with_frame(PyCodeObject *c, PyObject* func, PyObject* args, return res; } -/* Python 2.0 and later versions, when built with Unicode support */ -#define STRING_CONV_FUNC (self->returns_unicode \ - ? conv_string_to_unicode : conv_string_to_utf8) - static PyObject* string_intern(xmlparseobject *self, const char* str) { - PyObject *result = STRING_CONV_FUNC(str); + PyObject *result = conv_string_to_unicode(str); PyObject *value; /* result can be NULL if the unicode conversion failed. */ if (!result) @@ -449,9 +414,7 @@ call_character_handler(xmlparseobject *self, const XML_Char *buffer, int len) args = PyTuple_New(1); if (args == NULL) return -1; - temp = (self->returns_unicode - ? conv_string_len_to_unicode(buffer, len) - : conv_string_len_to_utf8(buffer, len)); + temp = (conv_string_len_to_unicode(buffer, len)); if (temp == NULL) { Py_DECREF(args); flag_error(self); @@ -556,7 +519,7 @@ my_StartElementHandler(void *userData, Py_DECREF(container); return; } - v = STRING_CONV_FUNC((XML_Char *) atts[i+1]); + v = conv_string_to_unicode((XML_Char *) atts[i+1]); if (v == NULL) { flag_error(self); Py_DECREF(container); @@ -645,7 +608,7 @@ VOID_HANDLER(ProcessingInstruction, (void *userData, const XML_Char *target, const XML_Char *data), - ("(NO&)", string_intern(self, target), STRING_CONV_FUNC,data)) + ("(NO&)", string_intern(self, target), conv_string_to_unicode ,data)) VOID_HANDLER(UnparsedEntityDecl, (void *userData, @@ -671,9 +634,7 @@ VOID_HANDLER(EntityDecl, const XML_Char *notationName), ("NiNNNNN", string_intern(self, entityName), is_parameter_entity, - (self->returns_unicode - ? conv_string_len_to_unicode(value, value_length) - : conv_string_len_to_utf8(value, value_length)), + (conv_string_len_to_unicode(value, value_length)), string_intern(self, base), string_intern(self, systemId), string_intern(self, publicId), string_intern(self, notationName))) @@ -684,7 +645,7 @@ VOID_HANDLER(XmlDecl, const XML_Char *encoding, int standalone), ("(O&O&i)", - STRING_CONV_FUNC,version, STRING_CONV_FUNC,encoding, + conv_string_to_unicode ,version, conv_string_to_unicode ,encoding, standalone)) static PyObject * @@ -727,10 +688,7 @@ my_ElementDeclHandler(void *userData, if (flush_character_buffer(self) < 0) goto finally; - modelobj = conv_content_model(model, - (self->returns_unicode - ? conv_string_to_unicode - : conv_string_to_utf8)); + modelobj = conv_content_model(model, (conv_string_to_unicode)); if (modelobj == NULL) { flag_error(self); goto finally; @@ -772,7 +730,7 @@ VOID_HANDLER(AttlistDecl, int isrequired), ("(NNO&O&i)", string_intern(self, elname), string_intern(self, attname), - STRING_CONV_FUNC,att_type, STRING_CONV_FUNC,dflt, + conv_string_to_unicode ,att_type, conv_string_to_unicode ,dflt, isrequired)) #if XML_COMBINED_VERSION >= 19504 @@ -808,7 +766,7 @@ VOID_HANDLER(EndNamespaceDecl, VOID_HANDLER(Comment, (void *userData, const XML_Char *data), - ("(O&)", STRING_CONV_FUNC,data)) + ("(O&)", conv_string_to_unicode ,data)) VOID_HANDLER(StartCdataSection, (void *userData), @@ -820,15 +778,11 @@ VOID_HANDLER(EndCdataSection, VOID_HANDLER(Default, (void *userData, const XML_Char *s, int len), - ("(N)", (self->returns_unicode - ? conv_string_len_to_unicode(s,len) - : conv_string_len_to_utf8(s,len)))) + ("(N)", (conv_string_len_to_unicode(s,len)))) VOID_HANDLER(DefaultHandlerExpand, (void *userData, const XML_Char *s, int len), - ("(N)", (self->returns_unicode - ? conv_string_len_to_unicode(s,len) - : conv_string_len_to_utf8(s,len)))) + ("(N)", (conv_string_len_to_unicode(s,len)))) INT_HANDLER(NotStandalone, (void *userData), @@ -842,7 +796,7 @@ RC_HANDLER(int, ExternalEntityRef, const XML_Char *publicId), int rc=0;, ("(O&NNN)", - STRING_CONV_FUNC,context, string_intern(self, base), + conv_string_to_unicode ,context, string_intern(self, base), string_intern(self, systemId), string_intern(self, publicId)), rc = PyInt_AsLong(rv);, rc, XML_GetUserData(parser)) @@ -924,13 +878,13 @@ readinst(char *buf, int buf_size, PyObject *meth) goto finally; /* XXX what to do if it returns a Unicode string? */ - if (!PyString_Check(str)) { + if (!PyBytes_Check(str)) { PyErr_Format(PyExc_TypeError, - "read() did not return a string object (type=%.400s)", + "read() did not return a bytes object (type=%.400s)", Py_Type(str)->tp_name); goto finally; } - len = PyString_GET_SIZE(str); + len = PyBytes_GET_SIZE(str); if (len > buf_size) { PyErr_Format(PyExc_ValueError, "read() returned too much data: " @@ -938,7 +892,7 @@ readinst(char *buf, int buf_size, PyObject *meth) buf_size, len); goto finally; } - memcpy(buf, PyString_AsString(str), len); + memcpy(buf, PyBytes_AsString(str), len); finally: Py_XDECREF(arg); Py_XDECREF(str); @@ -1044,7 +998,7 @@ xmlparse_GetInputContext(xmlparseobject *self, PyObject *unused) = XML_GetInputContext(self->itself, &offset, &size); if (buffer != NULL) - return PyString_FromStringAndSize(buffer + offset, + return PyBytes_FromStringAndSize(buffer + offset, size - offset); else Py_RETURN_NONE; @@ -1098,7 +1052,6 @@ xmlparse_ExternalEntityParserCreate(xmlparseobject *self, PyObject *args) } else new_parser->buffer = NULL; - new_parser->returns_unicode = self->returns_unicode; new_parser->ordered_attributes = self->ordered_attributes; new_parser->specified_attributes = self->specified_attributes; new_parser->in_callback = 0; @@ -1283,8 +1236,6 @@ newxmlparseobject(char *encoding, char *namespace_separator, PyObject *intern) if (self == NULL) return NULL; - self->returns_unicode = 1; - self->buffer = NULL; self->buffer_size = CHARACTER_DATA_BUFFER_SIZE; self->buffer_used = 0; @@ -1436,8 +1387,6 @@ xmlparse_getattr(xmlparseobject *self, char *name) return get_pybool(self->ns_prefixes); if (strcmp(name, "ordered_attributes") == 0) return get_pybool(self->ordered_attributes); - if (strcmp(name, "returns_unicode") == 0) - return get_pybool((long) self->returns_unicode); if (strcmp(name, "specified_attributes") == 0) return get_pybool((long) self->specified_attributes); if (strcmp(name, "intern") == 0) { @@ -1482,7 +1431,6 @@ xmlparse_getattr(xmlparseobject *self, char *name) APPEND(rc, "buffer_used"); APPEND(rc, "namespace_prefixes"); APPEND(rc, "ordered_attributes"); - APPEND(rc, "returns_unicode"); APPEND(rc, "specified_attributes"); APPEND(rc, "intern"); @@ -1570,14 +1518,6 @@ xmlparse_setattr(xmlparseobject *self, char *name, PyObject *v) self->ordered_attributes = 0; return 0; } - if (strcmp(name, "returns_unicode") == 0) { - if (PyObject_IsTrue(v)) { - self->returns_unicode = 1; - } - else - self->returns_unicode = 0; - return 0; - } if (strcmp(name, "specified_attributes") == 0) { if (PyObject_IsTrue(v)) self->specified_attributes = 1; |