summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 2007-07-23 17:42:32 (GMT)
committer: Guido van Rossum <guido@python.org> 2007-07-23 17:42:32 (GMT)
commit: 4ca947183154a7cfc7a6ccbb2e5c856a16a5dce3 (patch)
tree: 2d0aa37ade9702ac5af2725414d6ab24b0125171
parent: 9e473c28e4eb65e86fc11a5717cc6e7e1febd898 (diff)
download: cpython-4ca947183154a7cfc7a6ccbb2e5c856a16a5dce3.zip
cpython-4ca947183154a7cfc7a6ccbb2e5c856a16a5dce3.tar.gz
cpython-4ca947183154a7cfc7a6ccbb2e5c856a16a5dce3.tar.bz2
SF patch #1759016 by Joe Gregorio, who writes:
1. Removed "returns_unicode" attribute, associated code in the module to support that attribute, and all tests associated with it.
2. Parsed data is now returned as unicode strings.
3. Changed input tests to use io.BytesIO instead of StringIO, to reflect the byte processing nature of expat.
-rw-r--r-- Doc/lib/libpyexpat.tex 17
-rw-r--r-- Lib/test/test_pyexpat.py 65
-rw-r--r-- Modules/pyexpat.c 94
3 files changed, 28 insertions, 148 deletions
diff --git a/Doc/lib/libpyexpat.tex b/Doc/lib/libpyexpat.tex
index a0ea8a1..ed0bf6a 100644
--- a/Doc/lib/libpyexpat.tex
+++ b/Doc/lib/libpyexpat.tex
@@ -151,8 +151,8 @@ Create a ``child'' parser which can be used to parse an external
parsed entity referred to by content parsed by the parent parser. The
\var{context} parameter should be the string passed to the
\method{ExternalEntityRefHandler()} handler function, described below.
-The child parser is created with the \member{ordered_attributes},
-\member{returns_unicode} and \member{specified_attributes} set to the
+The child parser is created with the \member{ordered_attributes}
+and \member{specified_attributes} set to the
values of this parser.
\end{methoddesc}
@@ -214,16 +214,6 @@ any time.
\versionadded{2.1}
\end{memberdesc}
-\begin{memberdesc}[xmlparser]{returns_unicode}
-If this attribute is set to a non-zero integer, the handler functions
-will be passed Unicode strings. If \member{returns_unicode} is
-\constant{False}, 8-bit strings containing UTF-8 encoded data will be
-passed to the handlers. This is \constant{True} by default when
-Python is built with Unicode support.
-\versionchanged[Can be changed at any time to affect the result
- type]{1.6}
-\end{memberdesc}
-
\begin{memberdesc}[xmlparser]{specified_attributes}
If set to a non-zero integer, the parser will report only those
attributes which were specified in the document instance and not those
@@ -290,8 +280,7 @@ Called when the XML declaration is parsed. The XML declaration is the
(optional) declaration of the applicable version of the XML
recommendation, the encoding of the document text, and an optional
``standalone'' declaration. \var{version} and \var{encoding} will be
-strings of the type dictated by the \member{returns_unicode}
-attribute, and \var{standalone} will be \code{1} if the document is
+strings, and \var{standalone} will be \code{1} if the document is
declared standalone, \code{0} if it is declared not to be standalone,
or \code{-1} if the standalone clause was omitted.
This is only available with Expat version 1.95.0 or newer.
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py
index 62c5d02..cb4e6eb 100644
--- a/Lib/test/test_pyexpat.py
+++ b/Lib/test/test_pyexpat.py
@@ -1,7 +1,7 @@
# XXX TypeErrors on calling handlers, or on bad return values from a
# handler, are obscure and unhelpful.
-import StringIO
+from io import BytesIO
import unittest
import pyexpat
@@ -20,11 +20,6 @@ class SetAttributeTest(unittest.TestCase):
[0, 0],
]
- def test_returns_unicode(self):
- for x, y in self.set_get_pairs:
- self.parser.returns_unicode = x
- self.assertEquals(self.parser.returns_unicode, y)
-
def test_ordered_attributes(self):
for x, y in self.set_get_pairs:
self.parser.ordered_attributes = x
@@ -36,7 +31,7 @@ class SetAttributeTest(unittest.TestCase):
self.assertEquals(self.parser.specified_attributes, y)
-data = '''\
+data = b'''\
<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<?xml-stylesheet href="stylesheet.css"?>
<!-- comment data -->
@@ -130,22 +125,12 @@ class ParseTest(unittest.TestCase):
'ExternalEntityRefHandler'
]
- def test_utf8(self):
-
- out = self.Outputter()
- parser = expat.ParserCreate(namespace_separator='!')
- for name in self.handler_names:
- setattr(parser, name, getattr(out, name))
- parser.returns_unicode = 0
- parser.Parse(data, 1)
-
- # Verify output
- op = out.out
+ def _verify_parse_output(self, op):
self.assertEquals(op[0], 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'')
self.assertEquals(op[1], "Comment: ' comment data '")
self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)")
self.assertEquals(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')")
- self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}")
+ self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\u1f40'}")
self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'")
self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}")
self.assertEquals(op[7], "Character data: 'Contents of subelements'")
@@ -159,65 +144,31 @@ class ParseTest(unittest.TestCase):
self.assertEquals(op[15], "External entity ref: (None, 'entity.file', None)")
self.assertEquals(op[16], "End element: 'root'")
+
def test_unicode(self):
# Try the parse again, this time producing Unicode output
out = self.Outputter()
parser = expat.ParserCreate(namespace_separator='!')
- parser.returns_unicode = 1
for name in self.handler_names:
setattr(parser, name, getattr(out, name))
parser.Parse(data, 1)
op = out.out
- self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'')
- self.assertEquals(op[1], "Comment: u' comment data '")
- self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)")
- self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')")
- self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}")
- self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'")
- self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}")
- self.assertEquals(op[7], "Character data: u'Contents of subelements'")
- self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'")
- self.assertEquals(op[9], "End of NS decl: u'myns'")
- self.assertEquals(op[10], "Start element: u'sub2' {}")
- self.assertEquals(op[11], 'Start of CDATA section')
- self.assertEquals(op[12], "Character data: u'contents of CDATA section'")
- self.assertEquals(op[13], 'End of CDATA section')
- self.assertEquals(op[14], "End element: u'sub2'")
- self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)")
- self.assertEquals(op[16], "End element: u'root'")
+ self._verify_parse_output(op)
def test_parse_file(self):
# Try parsing a file
out = self.Outputter()
parser = expat.ParserCreate(namespace_separator='!')
- parser.returns_unicode = 1
for name in self.handler_names:
setattr(parser, name, getattr(out, name))
- file = StringIO.StringIO(data)
+ file = BytesIO(data)
parser.ParseFile(file)
op = out.out
- self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'')
- self.assertEquals(op[1], "Comment: u' comment data '")
- self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)")
- self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')")
- self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}")
- self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'")
- self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}")
- self.assertEquals(op[7], "Character data: u'Contents of subelements'")
- self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'")
- self.assertEquals(op[9], "End of NS decl: u'myns'")
- self.assertEquals(op[10], "Start element: u'sub2' {}")
- self.assertEquals(op[11], 'Start of CDATA section')
- self.assertEquals(op[12], "Character data: u'contents of CDATA section'")
- self.assertEquals(op[13], 'End of CDATA section')
- self.assertEquals(op[14], "End element: u'sub2'")
- self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)")
- self.assertEquals(op[16], "End element: u'root'")
-
+ self._verify_parse_output(op)
class NamespaceSeparatorTest(unittest.TestCase):
def test_legal(self):
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 13c1d27..8638b2e 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -62,8 +62,6 @@ typedef struct {
PyObject_HEAD
XML_Parser itself;
- int returns_unicode; /* True if Unicode strings are returned;
- if false, UTF-8 strings are returned */
int ordered_attributes; /* Return attributes as a list. */
int specified_attributes; /* Report only specified attributes. */
int in_callback; /* Is a callback active? */
@@ -185,35 +183,6 @@ conv_string_len_to_unicode(const XML_Char *str, int len)
return PyUnicode_DecodeUTF8((const char *)str, len, "strict");
}
-/* Convert a string of XML_Chars into an 8-bit Python string.
- Returns None if str is a null pointer. */
-
-static PyObject *
-conv_string_to_utf8(const XML_Char *str)
-{
- /* XXX currently this code assumes that XML_Char is 8-bit,
- and hence in UTF-8. */
- /* UTF-8 from Expat, UTF-8 desired */
- if (str == NULL) {
- Py_INCREF(Py_None);
- return Py_None;
- }
- return PyString_FromString(str);
-}
-
-static PyObject *
-conv_string_len_to_utf8(const XML_Char *str, int len)
-{
- /* XXX currently this code assumes that XML_Char is 8-bit,
- and hence in UTF-8. */
- /* UTF-8 from Expat, UTF-8 desired */
- if (str == NULL) {
- Py_INCREF(Py_None);
- return Py_None;
- }
- return PyString_FromStringAndSize((const char *)str, len);
-}
-
/* Callback routines */
static void clear_handlers(xmlparseobject *self, int initial);
@@ -411,14 +380,10 @@ call_with_frame(PyCodeObject *c, PyObject* func, PyObject* args,
return res;
}
-/* Python 2.0 and later versions, when built with Unicode support */
-#define STRING_CONV_FUNC (self->returns_unicode \
- ? conv_string_to_unicode : conv_string_to_utf8)
-
static PyObject*
string_intern(xmlparseobject *self, const char* str)
{
- PyObject *result = STRING_CONV_FUNC(str);
+ PyObject *result = conv_string_to_unicode(str);
PyObject *value;
/* result can be NULL if the unicode conversion failed. */
if (!result)
@@ -449,9 +414,7 @@ call_character_handler(xmlparseobject *self, const XML_Char *buffer, int len)
args = PyTuple_New(1);
if (args == NULL)
return -1;
- temp = (self->returns_unicode
- ? conv_string_len_to_unicode(buffer, len)
- : conv_string_len_to_utf8(buffer, len));
+ temp = (conv_string_len_to_unicode(buffer, len));
if (temp == NULL) {
Py_DECREF(args);
flag_error(self);
@@ -556,7 +519,7 @@ my_StartElementHandler(void *userData,
Py_DECREF(container);
return;
}
- v = STRING_CONV_FUNC((XML_Char *) atts[i+1]);
+ v = conv_string_to_unicode((XML_Char *) atts[i+1]);
if (v == NULL) {
flag_error(self);
Py_DECREF(container);
@@ -645,7 +608,7 @@ VOID_HANDLER(ProcessingInstruction,
(void *userData,
const XML_Char *target,
const XML_Char *data),
- ("(NO&)", string_intern(self, target), STRING_CONV_FUNC,data))
+ ("(NO&)", string_intern(self, target), conv_string_to_unicode ,data))
VOID_HANDLER(UnparsedEntityDecl,
(void *userData,
@@ -671,9 +634,7 @@ VOID_HANDLER(EntityDecl,
const XML_Char *notationName),
("NiNNNNN",
string_intern(self, entityName), is_parameter_entity,
- (self->returns_unicode
- ? conv_string_len_to_unicode(value, value_length)
- : conv_string_len_to_utf8(value, value_length)),
+ (conv_string_len_to_unicode(value, value_length)),
string_intern(self, base), string_intern(self, systemId),
string_intern(self, publicId),
string_intern(self, notationName)))
@@ -684,7 +645,7 @@ VOID_HANDLER(XmlDecl,
const XML_Char *encoding,
int standalone),
("(O&O&i)",
- STRING_CONV_FUNC,version, STRING_CONV_FUNC,encoding,
+ conv_string_to_unicode ,version, conv_string_to_unicode ,encoding,
standalone))
static PyObject *
@@ -727,10 +688,7 @@ my_ElementDeclHandler(void *userData,
if (flush_character_buffer(self) < 0)
goto finally;
- modelobj = conv_content_model(model,
- (self->returns_unicode
- ? conv_string_to_unicode
- : conv_string_to_utf8));
+ modelobj = conv_content_model(model, (conv_string_to_unicode));
if (modelobj == NULL) {
flag_error(self);
goto finally;
@@ -772,7 +730,7 @@ VOID_HANDLER(AttlistDecl,
int isrequired),
("(NNO&O&i)",
string_intern(self, elname), string_intern(self, attname),
- STRING_CONV_FUNC,att_type, STRING_CONV_FUNC,dflt,
+ conv_string_to_unicode ,att_type, conv_string_to_unicode ,dflt,
isrequired))
#if XML_COMBINED_VERSION >= 19504
@@ -808,7 +766,7 @@ VOID_HANDLER(EndNamespaceDecl,
VOID_HANDLER(Comment,
(void *userData, const XML_Char *data),
- ("(O&)", STRING_CONV_FUNC,data))
+ ("(O&)", conv_string_to_unicode ,data))
VOID_HANDLER(StartCdataSection,
(void *userData),
@@ -820,15 +778,11 @@ VOID_HANDLER(EndCdataSection,
VOID_HANDLER(Default,
(void *userData, const XML_Char *s, int len),
- ("(N)", (self->returns_unicode
- ? conv_string_len_to_unicode(s,len)
- : conv_string_len_to_utf8(s,len))))
+ ("(N)", (conv_string_len_to_unicode(s,len))))
VOID_HANDLER(DefaultHandlerExpand,
(void *userData, const XML_Char *s, int len),
- ("(N)", (self->returns_unicode
- ? conv_string_len_to_unicode(s,len)
- : conv_string_len_to_utf8(s,len))))
+ ("(N)", (conv_string_len_to_unicode(s,len))))
INT_HANDLER(NotStandalone,
(void *userData),
@@ -842,7 +796,7 @@ RC_HANDLER(int, ExternalEntityRef,
const XML_Char *publicId),
int rc=0;,
("(O&NNN)",
- STRING_CONV_FUNC,context, string_intern(self, base),
+ conv_string_to_unicode ,context, string_intern(self, base),
string_intern(self, systemId), string_intern(self, publicId)),
rc = PyInt_AsLong(rv);, rc,
XML_GetUserData(parser))
@@ -924,13 +878,13 @@ readinst(char *buf, int buf_size, PyObject *meth)
goto finally;
/* XXX what to do if it returns a Unicode string? */
- if (!PyString_Check(str)) {
+ if (!PyBytes_Check(str)) {
PyErr_Format(PyExc_TypeError,
- "read() did not return a string object (type=%.400s)",
+ "read() did not return a bytes object (type=%.400s)",
Py_Type(str)->tp_name);
goto finally;
}
- len = PyString_GET_SIZE(str);
+ len = PyBytes_GET_SIZE(str);
if (len > buf_size) {
PyErr_Format(PyExc_ValueError,
"read() returned too much data: "
@@ -938,7 +892,7 @@ readinst(char *buf, int buf_size, PyObject *meth)
buf_size, len);
goto finally;
}
- memcpy(buf, PyString_AsString(str), len);
+ memcpy(buf, PyBytes_AsString(str), len);
finally:
Py_XDECREF(arg);
Py_XDECREF(str);
@@ -1044,7 +998,7 @@ xmlparse_GetInputContext(xmlparseobject *self, PyObject *unused)
= XML_GetInputContext(self->itself, &offset, &size);
if (buffer != NULL)
- return PyString_FromStringAndSize(buffer + offset,
+ return PyBytes_FromStringAndSize(buffer + offset,
size - offset);
else
Py_RETURN_NONE;
@@ -1098,7 +1052,6 @@ xmlparse_ExternalEntityParserCreate(xmlparseobject *self, PyObject *args)
}
else
new_parser->buffer = NULL;
- new_parser->returns_unicode = self->returns_unicode;
new_parser->ordered_attributes = self->ordered_attributes;
new_parser->specified_attributes = self->specified_attributes;
new_parser->in_callback = 0;
@@ -1283,8 +1236,6 @@ newxmlparseobject(char *encoding, char *namespace_separator, PyObject *intern)
if (self == NULL)
return NULL;
- self->returns_unicode = 1;
-
self->buffer = NULL;
self->buffer_size = CHARACTER_DATA_BUFFER_SIZE;
self->buffer_used = 0;
@@ -1436,8 +1387,6 @@ xmlparse_getattr(xmlparseobject *self, char *name)
return get_pybool(self->ns_prefixes);
if (strcmp(name, "ordered_attributes") == 0)
return get_pybool(self->ordered_attributes);
- if (strcmp(name, "returns_unicode") == 0)
- return get_pybool((long) self->returns_unicode);
if (strcmp(name, "specified_attributes") == 0)
return get_pybool((long) self->specified_attributes);
if (strcmp(name, "intern") == 0) {
@@ -1482,7 +1431,6 @@ xmlparse_getattr(xmlparseobject *self, char *name)
APPEND(rc, "buffer_used");
APPEND(rc, "namespace_prefixes");
APPEND(rc, "ordered_attributes");
- APPEND(rc, "returns_unicode");
APPEND(rc, "specified_attributes");
APPEND(rc, "intern");
@@ -1570,14 +1518,6 @@ xmlparse_setattr(xmlparseobject *self, char *name, PyObject *v)
self->ordered_attributes = 0;
return 0;
}
- if (strcmp(name, "returns_unicode") == 0) {
- if (PyObject_IsTrue(v)) {
- self->returns_unicode = 1;
- }
- else
- self->returns_unicode = 0;
- return 0;
- }
if (strcmp(name, "specified_attributes") == 0) {
if (PyObject_IsTrue(v))
self->specified_attributes = 1;