diff options
-rw-r--r-- | Lib/test/test_pyexpat.py | 95 | ||||
-rw-r--r-- | Modules/pyexpat.c | 177 |
2 files changed, 251 insertions, 21 deletions
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index 22b3099..2415ff0 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -6,7 +6,7 @@ import pyexpat from xml.parsers import expat -from test_support import sortdict +from test_support import sortdict, TestFailed class Outputter: def StartElementHandler(self, name, attrs): @@ -218,3 +218,96 @@ for entry in L: print "(it didn't)" print "L =", `L` break + +# Tests of the buffer_text attribute. +import sys + +class TextCollector: + def __init__(self, parser): + self.stuff = [] + + def check(self, expected, label): + require(self.stuff == expected, + "%s\nstuff = %s\nexpected = %s" + % (label, `self.stuff`, `map(unicode, expected)`)) + + def CharacterDataHandler(self, text): + self.stuff.append(text) + + def StartElementHandler(self, name, attrs): + self.stuff.append("<%s>" % name) + bt = attrs.get("buffer-text") + if bt == "yes": + parser.buffer_text = 1 + elif bt == "no": + parser.buffer_text = 0 + + def EndElementHandler(self, name): + self.stuff.append("</%s>" % name) + + def CommentHandler(self, data): + self.stuff.append("<!--%s-->" % data) + +def require(cond, label): + # similar to confirm(), but no extraneous output + if not cond: + raise TestFailed(label) + +def setup(handlers=[]): + parser = expat.ParserCreate() + require(not parser.buffer_text, + "buffer_text not disabled by default") + parser.buffer_text = 1 + handler = TextCollector(parser) + parser.CharacterDataHandler = handler.CharacterDataHandler + for name in handlers: + setattr(parser, name, getattr(handler, name)) + return parser, handler + +parser, handler = setup() +require(parser.buffer_text, + "text buffering either not acknowledged or not enabled") +parser.Parse("<a>1<b/>2<c/>3</a>", 1) +handler.check(["123"], + "buffered text not properly collapsed") + +# XXX This test exposes more detail of Expat's text chunking than we +# XXX like, but it tests what we need to concisely. +parser, handler = setup(["StartElementHandler"]) +parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1) +handler.check(["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"], + "buffering control not reacting as expected") + +parser, handler = setup() +parser.Parse("<a>1<b/><2><c/> \n 3</a>", 1) +handler.check(["1<2> \n 3"], + "buffered text not properly collapsed") + +parser, handler = setup(["StartElementHandler"]) +parser.Parse("<a>1<b/>2<c/>3</a>", 1) +handler.check(["<a>", "1", "<b>", "2", "<c>", "3"], + "buffered text not properly split") + +parser, handler = setup(["StartElementHandler", "EndElementHandler"]) +parser.CharacterDataHandler = None +parser.Parse("<a>1<b/>2<c/>3</a>", 1) +handler.check(["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"], + "huh?") + +parser, handler = setup(["StartElementHandler", "EndElementHandler"]) +parser.Parse("<a>1<b></b>2<c/>3</a>", 1) +handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"], + "huh?") + +parser, handler = setup(["CommentHandler", "EndElementHandler", + "StartElementHandler"]) +parser.Parse("<a>1<b/>2<c></c>345</a> ", 1) +handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"], + "buffered text not properly split") + +parser, handler = setup(["CommentHandler", "EndElementHandler", + "StartElementHandler"]) +parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1) +handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", + "<!--abc-->", "4", "<!--def-->", "5", "</a>"], + "buffered text not properly split") diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index ce82014..e1dbf2f 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -60,10 +60,16 @@ typedef struct { int ordered_attributes; /* Return attributes as a list. */ int specified_attributes; /* Report only specified attributes. */ int in_callback; /* Is a callback active? */ + XML_Char *buffer; /* Buffer used when accumulating characters */ + /* NULL if not enabled */ + int buffer_size; /* Size of buffer, in XML_Char units */ + int buffer_used; /* Buffer units in use */ PyObject *intern; /* Dictionary to intern strings */ PyObject **handlers; } xmlparseobject; +#define CHARACTER_DATA_BUFFER_SIZE 8192 + staticforward PyTypeObject Xmlparsetype; typedef void (*xmlhandlersetter)(XML_Parser *self, void *meth); @@ -313,6 +319,85 @@ string_intern(xmlparseobject *self, const char* str) return value; } +/* Return 0 on success, -1 on exception. + * flag_error() will be called before return if needed. + */ +static int +call_character_handler(xmlparseobject *self, const XML_Char *buffer, int len) +{ + PyObject *args; + PyObject *temp; + + args = PyTuple_New(1); + if (args == NULL) + return -1; +#ifdef Py_USING_UNICODE + temp = (self->returns_unicode + ? conv_string_len_to_unicode(buffer, len) + : conv_string_len_to_utf8(buffer, len)); +#else + temp = conv_string_len_to_utf8(buffer, len); +#endif + if (temp == NULL) { + Py_DECREF(args); + flag_error(self); + return -1; + } + PyTuple_SET_ITEM(args, 0, temp); + /* temp is now a borrowed reference; consider it unused. */ + self->in_callback = 1; + temp = call_with_frame(getcode(CharacterData, "CharacterData", __LINE__), + self->handlers[CharacterData], args); + /* temp is an owned reference again, or NULL */ + self->in_callback = 0; + Py_DECREF(args); + if (temp == NULL) { + flag_error(self); + return -1; + } + Py_DECREF(temp); + return 0; +} + +static int +flush_character_buffer(xmlparseobject *self) +{ + int rc; + if (self->buffer == NULL || self->buffer_used == 0) + return 0; + rc = call_character_handler(self, self->buffer, self->buffer_used); + self->buffer_used = 0; + return rc; +} + +static void +my_CharacterDataHandler(void *userData, const XML_Char *data, int len) +{ + xmlparseobject *self = (xmlparseobject *) userData; + if (self->buffer == NULL) + call_character_handler(self, data, len); + else { + if ((self->buffer_used + len) > self->buffer_size) { + if (flush_character_buffer(self) < 0) + return; + /* handler might have changed; drop the rest on the floor + * if there isn't a handler anymore + */ + if (!have_handler(self, CharacterData)) + return; + } + if (len > self->buffer_size) { + call_character_handler(self, data, len); + self->buffer_used = 0; + } + else { + memcpy(self->buffer + self->buffer_used, + data, len * sizeof(XML_Char)); + self->buffer_used += len; + } + } +} + static void my_StartElementHandler(void *userData, const XML_Char *name, const XML_Char *atts[]) @@ -323,6 +408,8 @@ my_StartElementHandler(void *userData, PyObject *container, *rv, *args; int i, max; + if (flush_character_buffer(self) < 0) + return; /* Set max to the number of slots filled in atts[]; max/2 is * the number of attributes we need to process. */ @@ -402,6 +489,8 @@ my_##NAME##Handler PARAMS {\ INIT \ \ if (have_handler(self, NAME)) { \ + if (flush_character_buffer(self) < 0) \ + return RETURN; \ args = Py_BuildValue PARAM_FORMAT ;\ if (!args) { flag_error(self); return RETURN;} \ self->in_callback = 1; \ @@ -438,18 +527,6 @@ VOID_HANDLER(ProcessingInstruction, const XML_Char *data), ("(NO&)", string_intern(self, target), STRING_CONV_FUNC,data)) -#ifndef Py_USING_UNICODE -VOID_HANDLER(CharacterData, - (void *userData, const XML_Char *data, int len), - ("(N)", conv_string_len_to_utf8(data,len))) -#else -VOID_HANDLER(CharacterData, - (void *userData, const XML_Char *data, int len), - ("(N)", (self->returns_unicode - ? conv_string_len_to_unicode(data,len) - : conv_string_len_to_utf8(data,len)))) -#endif - VOID_HANDLER(UnparsedEntityDecl, (void *userData, const XML_Char *entityName, @@ -673,6 +750,9 @@ get_parse_result(xmlparseobject *self, int rv) if (rv == 0) { return set_error(self); } + if (flush_character_buffer(self) < 0) { + return NULL; + } return PyInt_FromLong(rv); } @@ -890,6 +970,17 @@ xmlparse_ExternalEntityParserCreate(xmlparseobject *self, PyObject *args) if (new_parser == NULL) return NULL; + new_parser->buffer_size = self->buffer_size; + new_parser->buffer_used = 0; + if (self->buffer != NULL) { + new_parser->buffer = malloc(new_parser->buffer_size); + if (new_parser->buffer == NULL) { + PyObject_GC_Del(new_parser); + return PyErr_NoMemory(); + } + } + else + new_parser->buffer = NULL; new_parser->returns_unicode = self->returns_unicode; new_parser->ordered_attributes = self->ordered_attributes; new_parser->specified_attributes = self->specified_attributes; @@ -913,10 +1004,10 @@ xmlparse_ExternalEntityParserCreate(xmlparseobject *self, PyObject *args) XML_SetUserData(new_parser->itself, (void *)new_parser); /* allocate and clear handlers first */ - for(i = 0; handler_info[i].name != NULL; i++) + for (i = 0; handler_info[i].name != NULL; i++) /* do nothing */; - new_parser->handlers = malloc(sizeof(PyObject *)*i); + new_parser->handlers = malloc(sizeof(PyObject *) * i); if (!new_parser->handlers) { Py_DECREF(new_parser); return PyErr_NoMemory(); @@ -1053,6 +1144,9 @@ newxmlparseobject(char *encoding, char *namespace_separator, PyObject *intern) self->returns_unicode = 1; #endif + self->buffer = NULL; + self->buffer_size = CHARACTER_DATA_BUFFER_SIZE; + self->buffer_used = 0; self->ordered_attributes = 0; self->specified_attributes = 0; self->in_callback = 0; @@ -1081,7 +1175,7 @@ newxmlparseobject(char *encoding, char *namespace_separator, PyObject *intern) XML_SetUnknownEncodingHandler(self->itself, (XML_UnknownEncodingHandler) PyUnknownEncodingHandler, NULL); #endif - for(i = 0; handler_info[i].name != NULL; i++) + for (i = 0; handler_info[i].name != NULL; i++) /* do nothing */; self->handlers = malloc(sizeof(PyObject *)*i); @@ -1118,6 +1212,10 @@ xmlparse_dealloc(xmlparseobject *self) free(self->handlers); self->handlers = NULL; } + if (self->buffer != NULL) { + free(self->buffer); + self->buffer = NULL; + } Py_XDECREF(self->intern); #if PY_MAJOR_VERSION == 1 && PY_MINOR_VERSION < 6 /* Code for versions before 1.6 */ @@ -1179,6 +1277,14 @@ xmlparse_getattr(xmlparseobject *self, char *name) return PyInt_FromLong((long) XML_GetErrorByteIndex(self->itself)); } + if (name[0] == 'b') { + if (strcmp(name, "buffer_size") == 0) + return PyInt_FromLong((long) self->buffer_size); + if (strcmp(name, "buffer_text") == 0) + return get_pybool(self->buffer != NULL); + if (strcmp(name, "buffer_used") == 0) + return PyInt_FromLong((long) self->buffer_used); + } if (strcmp(name, "ordered_attributes") == 0) return get_pybool(self->ordered_attributes); if (strcmp(name, "returns_unicode") == 0) @@ -1206,6 +1312,9 @@ xmlparse_getattr(xmlparseobject *self, char *name) PyList_Append(rc, PyString_FromString("ErrorLineNumber")); PyList_Append(rc, PyString_FromString("ErrorColumnNumber")); PyList_Append(rc, PyString_FromString("ErrorByteIndex")); + PyList_Append(rc, PyString_FromString("buffer_size")); + PyList_Append(rc, PyString_FromString("buffer_text")); + PyList_Append(rc, PyString_FromString("buffer_used")); PyList_Append(rc, PyString_FromString("ordered_attributes")); PyList_Append(rc, PyString_FromString("returns_unicode")); PyList_Append(rc, PyString_FromString("specified_attributes")); @@ -1246,6 +1355,25 @@ xmlparse_setattr(xmlparseobject *self, char *name, PyObject *v) PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; } + if (strcmp(name, "buffer_text") == 0) { + if (PyObject_IsTrue(v)) { + if (self->buffer == NULL) { + self->buffer = malloc(self->buffer_size); + if (self->buffer == NULL) { + PyErr_NoMemory(); + return -1; + } + self->buffer_used = 0; + } + } + else if (self->buffer != NULL) { + if (flush_character_buffer(self) < 0) + return -1; + free(self->buffer); + self->buffer = NULL; + } + return 0; + } if (strcmp(name, "ordered_attributes") == 0) { if (PyObject_IsTrue(v)) self->ordered_attributes = 1; @@ -1274,6 +1402,15 @@ xmlparse_setattr(xmlparseobject *self, char *name, PyObject *v) self->specified_attributes = 0; return 0; } + if (strcmp(name, "CharacterDataHandler") == 0) { + /* If we're changing the character data handler, flush all + * cached data with the old handler. Not sure there's a + * "right" thing to do, though, but this probably won't + * happen. + */ + if (flush_character_buffer(self) < 0) + return -1; + } if (sethandler(self, name, v)) { return 0; } @@ -1658,16 +1795,16 @@ statichere struct HandlerInfo handler_info[] = { (xmlhandler)my_CharacterDataHandler}, {"UnparsedEntityDeclHandler", (xmlhandlersetter)XML_SetUnparsedEntityDeclHandler, - (xmlhandler)my_UnparsedEntityDeclHandler }, + (xmlhandler)my_UnparsedEntityDeclHandler}, {"NotationDeclHandler", (xmlhandlersetter)XML_SetNotationDeclHandler, - (xmlhandler)my_NotationDeclHandler }, + (xmlhandler)my_NotationDeclHandler}, {"StartNamespaceDeclHandler", (xmlhandlersetter)XML_SetStartNamespaceDeclHandler, - (xmlhandler)my_StartNamespaceDeclHandler }, + (xmlhandler)my_StartNamespaceDeclHandler}, {"EndNamespaceDeclHandler", (xmlhandlersetter)XML_SetEndNamespaceDeclHandler, - (xmlhandler)my_EndNamespaceDeclHandler }, + (xmlhandler)my_EndNamespaceDeclHandler}, {"CommentHandler", (xmlhandlersetter)XML_SetCommentHandler, (xmlhandler)my_CommentHandler}, @@ -1688,7 +1825,7 @@ statichere struct HandlerInfo handler_info[] = { (xmlhandler)my_NotStandaloneHandler}, {"ExternalEntityRefHandler", (xmlhandlersetter)XML_SetExternalEntityRefHandler, - (xmlhandler)my_ExternalEntityRefHandler }, + (xmlhandler)my_ExternalEntityRefHandler}, {"StartDoctypeDeclHandler", (xmlhandlersetter)XML_SetStartDoctypeDeclHandler, (xmlhandler)my_StartDoctypeDeclHandler}, |