# XXX TypeErrors on calling handlers, or on bad return values from a # handler, are obscure and unhelpful. import StringIO, sys import unittest from xml.parsers import expat from test import test_support from test.test_support import sortdict, run_unittest class SetAttributeTest(unittest.TestCase): def setUp(self): self.parser = expat.ParserCreate(namespace_separator='!') def test_buffer_text(self): self.assertIs(self.parser.buffer_text, False) for x in 0, 1, 2, 0: self.parser.buffer_text = x self.assertIs(self.parser.buffer_text, bool(x)) def test_namespace_prefixes(self): self.assertIs(self.parser.namespace_prefixes, False) for x in 0, 1, 2, 0: self.parser.namespace_prefixes = x self.assertIs(self.parser.namespace_prefixes, bool(x)) def test_returns_unicode(self): self.assertIs(self.parser.returns_unicode, test_support.have_unicode) for x in 0, 1, 2, 0: self.parser.returns_unicode = x self.assertIs(self.parser.returns_unicode, bool(x)) def test_ordered_attributes(self): self.assertIs(self.parser.ordered_attributes, False) for x in 0, 1, 2, 0: self.parser.ordered_attributes = x self.assertIs(self.parser.ordered_attributes, bool(x)) def test_specified_attributes(self): self.assertIs(self.parser.specified_attributes, False) for x in 0, 1, 2, 0: self.parser.specified_attributes = x self.assertIs(self.parser.specified_attributes, bool(x)) def test_invalid_attributes(self): with self.assertRaises(AttributeError): self.parser.foo = 1 with self.assertRaises(AttributeError): self.parser.foo data = '''\ %unparsed_entity; ]> Contents of subelements &external_entity; ''' # Produce UTF-8 output class ParseTest(unittest.TestCase): class Outputter: def __init__(self): self.out = [] def StartElementHandler(self, name, attrs): self.out.append('Start element: ' + repr(name) + ' ' + sortdict(attrs)) def EndElementHandler(self, name): self.out.append('End element: ' + repr(name)) def CharacterDataHandler(self, data): data = data.strip() if data: self.out.append('Character data: ' + repr(data)) def ProcessingInstructionHandler(self, target, data): self.out.append('PI: ' + repr(target) + ' ' + repr(data)) def StartNamespaceDeclHandler(self, prefix, uri): self.out.append('NS decl: ' + repr(prefix) + ' ' + repr(uri)) def EndNamespaceDeclHandler(self, prefix): self.out.append('End of NS decl: ' + repr(prefix)) def StartCdataSectionHandler(self): self.out.append('Start of CDATA section') def EndCdataSectionHandler(self): self.out.append('End of CDATA section') def CommentHandler(self, text): self.out.append('Comment: ' + repr(text)) def NotationDeclHandler(self, *args): name, base, sysid, pubid = args self.out.append('Notation declared: %s' %(args,)) def UnparsedEntityDeclHandler(self, *args): entityName, base, systemId, publicId, notationName = args self.out.append('Unparsed entity decl: %s' %(args,)) def NotStandaloneHandler(self, userData): self.out.append('Not standalone') return 1 def ExternalEntityRefHandler(self, *args): context, base, sysId, pubId = args self.out.append('External entity ref: %s' %(args[1:],)) return 1 def DefaultHandler(self, userData): pass def DefaultHandlerExpand(self, userData): pass handler_names = [ 'StartElementHandler', 'EndElementHandler', 'CharacterDataHandler', 'ProcessingInstructionHandler', 'UnparsedEntityDeclHandler', 'NotationDeclHandler', 'StartNamespaceDeclHandler', 'EndNamespaceDeclHandler', 'CommentHandler', 'StartCdataSectionHandler', 'EndCdataSectionHandler', 'DefaultHandler', 'DefaultHandlerExpand', #'NotStandaloneHandler', 'ExternalEntityRefHandler' ] def test_utf8(self): out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') for name in self.handler_names: setattr(parser, name, getattr(out, name)) parser.returns_unicode = 0 parser.Parse(data, 1) # Verify output op = out.out self.assertEqual(op[0], 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'') self.assertEqual(op[1], "Comment: ' comment data '") self.assertEqual(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)") self.assertEqual(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')") self.assertEqual(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}") self.assertEqual(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'") self.assertEqual(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}") self.assertEqual(op[7], "Character data: 'Contents of subelements'") self.assertEqual(op[8], "End element: 'http://www.python.org/namespace!subelement'") self.assertEqual(op[9], "End of NS decl: 'myns'") self.assertEqual(op[10], "Start element: 'sub2' {}") self.assertEqual(op[11], 'Start of CDATA section') self.assertEqual(op[12], "Character data: 'contents of CDATA section'") self.assertEqual(op[13], 'End of CDATA section') self.assertEqual(op[14], "End element: 'sub2'") self.assertEqual(op[15], "External entity ref: (None, 'entity.file', None)") self.assertEqual(op[16], "End element: 'root'") def test_unicode(self): # Try the parse again, this time producing Unicode output out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) parser.Parse(data, 1) op = out.out self.assertEqual(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') self.assertEqual(op[1], "Comment: u' comment data '") self.assertEqual(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") self.assertEqual(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") self.assertEqual(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") self.assertEqual(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") self.assertEqual(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") self.assertEqual(op[7], "Character data: u'Contents of subelements'") self.assertEqual(op[8], "End element: u'http://www.python.org/namespace!subelement'") self.assertEqual(op[9], "End of NS decl: u'myns'") self.assertEqual(op[10], "Start element: u'sub2' {}") self.assertEqual(op[11], 'Start of CDATA section') self.assertEqual(op[12], "Character data: u'contents of CDATA section'") self.assertEqual(op[13], 'End of CDATA section') self.assertEqual(op[14], "End element: u'sub2'") self.assertEqual(op[15], "External entity ref: (None, u'entity.file', None)") self.assertEqual(op[16], "End element: u'root'") def test_parse_file(self): # Try parsing a file out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) file = StringIO.StringIO(data) parser.ParseFile(file) op = out.out self.assertEqual(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') self.assertEqual(op[1], "Comment: u' comment data '") self.assertEqual(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") self.assertEqual(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") self.assertEqual(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") self.assertEqual(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") self.assertEqual(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") self.assertEqual(op[7], "Character data: u'Contents of subelements'") self.assertEqual(op[8], "End element: u'http://www.python.org/namespace!subelement'") self.assertEqual(op[9], "End of NS decl: u'myns'") self.assertEqual(op[10], "Start element: u'sub2' {}") self.assertEqual(op[11], 'Start of CDATA section') self.assertEqual(op[12], "Character data: u'contents of CDATA section'") self.assertEqual(op[13], 'End of CDATA section') self.assertEqual(op[14], "End element: u'sub2'") self.assertEqual(op[15], "External entity ref: (None, u'entity.file', None)") self.assertEqual(op[16], "End element: u'root'") # Issue 4877: expat.ParseFile causes segfault on a closed file. fp = open(test_support.TESTFN, 'wb') try: fp.close() parser = expat.ParserCreate() with self.assertRaises(ValueError): parser.ParseFile(fp) finally: test_support.unlink(test_support.TESTFN) def test_parse_again(self): parser = expat.ParserCreate() file = StringIO.StringIO(data) parser.ParseFile(file) # Issue 6676: ensure a meaningful exception is raised when attempting # to parse more than one XML document per xmlparser instance, # a limitation of the Expat library. with self.assertRaises(expat.error) as cm: parser.ParseFile(file) self.assertEqual(expat.ErrorString(cm.exception.code), expat.errors.XML_ERROR_FINISHED) class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): # Tests that make sure we get errors when the namespace_separator value # is illegal, and that we don't for good values: expat.ParserCreate() expat.ParserCreate(namespace_separator=None) expat.ParserCreate(namespace_separator=' ') def test_illegal(self): try: expat.ParserCreate(namespace_separator=42) self.fail() except TypeError, e: self.assertEqual(str(e), 'ParserCreate() argument 2 must be string or None, not int') try: expat.ParserCreate(namespace_separator='too long') self.fail() except ValueError, e: self.assertEqual(str(e), 'namespace_separator must be at most one character, omitted, or None') def test_zero_length(self): # ParserCreate() needs to accept a namespace_separator of zero length # to satisfy the requirements of RDF applications that are required # to simply glue together the namespace URI and the localname. Though # considered a wart of the RDF specifications, it needs to be supported. # # See XML-SIG mailing list thread starting with # http://mail.python.org/pipermail/xml-sig/2001-April/005202.html # expat.ParserCreate(namespace_separator='') # too short class InterningTest(unittest.TestCase): def test(self): # Test the interning machinery. p = expat.ParserCreate() L = [] def collector(name, *args): L.append(name) p.StartElementHandler = collector p.EndElementHandler = collector p.Parse(" ", 1) tag = L[0] self.assertEqual(len(L), 6) for entry in L: # L should have the same string repeated over and over. self.assertTrue(tag is entry) class BufferTextTest(unittest.TestCase): def setUp(self): self.stuff = [] self.parser = expat.ParserCreate() self.parser.buffer_text = 1 self.parser.CharacterDataHandler = self.CharacterDataHandler def check(self, expected, label): self.assertEqual(self.stuff, expected, "%s\nstuff = %r\nexpected = %r" % (label, self.stuff, map(unicode, expected))) def CharacterDataHandler(self, text): self.stuff.append(text) def StartElementHandler(self, name, attrs): self.stuff.append("<%s>" % name) bt = attrs.get("buffer-text") if bt == "yes": self.parser.buffer_text = 1 elif bt == "no": self.parser.buffer_text = 0 def EndElementHandler(self, name): self.stuff.append("" % name) def CommentHandler(self, data): self.stuff.append("" % data) def setHandlers(self, handlers=[]): for name in handlers: setattr(self.parser, name, getattr(self, name)) def test_default_to_disabled(self): parser = expat.ParserCreate() self.assertFalse(parser.buffer_text) def test_buffering_enabled(self): # Make sure buffering is turned on self.assertTrue(self.parser.buffer_text) self.parser.Parse("123", 1) self.assertEqual(self.stuff, ['123'], "buffered text not properly collapsed") def test1(self): # XXX This test exposes more detail of Expat's text chunking than we # XXX like, but it tests what we need to concisely. self.setHandlers(["StartElementHandler"]) self.parser.Parse("12\n34\n5", 1) self.assertEqual(self.stuff, ["", "1", "", "2", "\n", "3", "", "4\n5"], "buffering control not reacting as expected") def test2(self): self.parser.Parse("1<2> \n 3", 1) self.assertEqual(self.stuff, ["1<2> \n 3"], "buffered text not properly collapsed") def test3(self): self.setHandlers(["StartElementHandler"]) self.parser.Parse("123", 1) self.assertEqual(self.stuff, ["", "1", "", "2", "", "3"], "buffered text not properly split") def test4(self): self.setHandlers(["StartElementHandler", "EndElementHandler"]) self.parser.CharacterDataHandler = None self.parser.Parse("123", 1) self.assertEqual(self.stuff, ["", "", "", "", "", ""]) def test5(self): self.setHandlers(["StartElementHandler", "EndElementHandler"]) self.parser.Parse("123", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "3", ""]) def test6(self): self.setHandlers(["CommentHandler", "EndElementHandler", "StartElementHandler"]) self.parser.Parse("12345 ", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "345", ""], "buffered text not properly split") def test7(self): self.setHandlers(["CommentHandler", "EndElementHandler", "StartElementHandler"]) self.parser.Parse("12345 ", 1) self.assertEqual(self.stuff, ["", "1", "", "", "2", "", "", "3", "", "4", "", "5", ""], "buffered text not properly split") # Test handling of exception from callback: class HandlerExceptionTest(unittest.TestCase): def StartElementHandler(self, name, attrs): raise RuntimeError(name) def test(self): parser = expat.ParserCreate() parser.StartElementHandler = self.StartElementHandler try: parser.Parse("", 1) self.fail() except RuntimeError, e: self.assertEqual(e.args[0], 'a', "Expected RuntimeError for element 'a', but" + \ " found %r" % e.args[0]) # Test Current* members: class PositionTest(unittest.TestCase): def StartElementHandler(self, name, attrs): self.check_pos('s') def EndElementHandler(self, name): self.check_pos('e') def check_pos(self, event): pos = (event, self.parser.CurrentByteIndex, self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber) self.assertTrue(self.upto < len(self.expected_list), 'too many parser events') expected = self.expected_list[self.upto] self.assertEqual(pos, expected, 'Expected position %s, got position %s' %(pos, expected)) self.upto += 1 def test(self): self.parser = expat.ParserCreate() self.parser.StartElementHandler = self.StartElementHandler self.parser.EndElementHandler = self.EndElementHandler self.upto = 0 self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2), ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)] xml = '\n \n \n \n' self.parser.Parse(xml, 1) class sf1296433Test(unittest.TestCase): def test_parse_only_xml_data(self): # http://python.org/sf/1296433 # xml = "%s" % ('a' * 1025) # this one doesn't crash #xml = "%s" % ('a' * 10000) class SpecificException(Exception): pass def handler(text): raise SpecificException parser = expat.ParserCreate() parser.CharacterDataHandler = handler self.assertRaises(Exception, parser.Parse, xml) class ChardataBufferTest(unittest.TestCase): """ test setting of chardata buffer size """ def test_1025_bytes(self): self.assertEqual(self.small_buffer_test(1025), 2) def test_1000_bytes(self): self.assertEqual(self.small_buffer_test(1000), 1) def test_wrong_size(self): parser = expat.ParserCreate() parser.buffer_text = 1 with self.assertRaises(ValueError): parser.buffer_size = -1 with self.assertRaises(ValueError): parser.buffer_size = 0 with self.assertRaises(TypeError): parser.buffer_size = 512.0 with self.assertRaises(TypeError): parser.buffer_size = sys.maxint+1 def test_unchanged_size(self): xml1 = ("%s" % ('a' * 512)) xml2 = 'a'*512 + '' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_size = 512 parser.buffer_text = 1 # Feed 512 bytes of character data: the handler should be called # once. self.n = 0 parser.Parse(xml1) self.assertEqual(self.n, 1) # Reassign to buffer_size, but assign the same size. parser.buffer_size = parser.buffer_size self.assertEqual(self.n, 1) # Try parsing rest of the document parser.Parse(xml2) self.assertEqual(self.n, 2) def test_disabling_buffer(self): xml1 = "%s" % ('a' * 512) xml2 = ('b' * 1024) xml3 = "%s" % ('c' * 1024) parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 parser.buffer_size = 1024 self.assertEqual(parser.buffer_size, 1024) # Parse one chunk of XML self.n = 0 parser.Parse(xml1, 0) self.assertEqual(parser.buffer_size, 1024) self.assertEqual(self.n, 1) # Turn off buffering and parse the next chunk. parser.buffer_text = 0 self.assertFalse(parser.buffer_text) self.assertEqual(parser.buffer_size, 1024) for i in range(10): parser.Parse(xml2, 0) self.assertEqual(self.n, 11) parser.buffer_text = 1 self.assertTrue(parser.buffer_text) self.assertEqual(parser.buffer_size, 1024) parser.Parse(xml3, 1) self.assertEqual(self.n, 12) def make_document(self, bytes): return ("" + bytes * 'a' + '') def counting_handler(self, text): self.n += 1 def small_buffer_test(self, buffer_len): xml = "%s" % ('a' * buffer_len) parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_size = 1024 parser.buffer_text = 1 self.n = 0 parser.Parse(xml) return self.n def test_change_size_1(self): xml1 = "%s" % ('a' * 1024) xml2 = "aaa%s" % ('a' * 1025) parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 parser.buffer_size = 1024 self.assertEqual(parser.buffer_size, 1024) self.n = 0 parser.Parse(xml1, 0) parser.buffer_size *= 2 self.assertEqual(parser.buffer_size, 2048) parser.Parse(xml2, 1) self.assertEqual(self.n, 2) def test_change_size_2(self): xml1 = "a%s" % ('a' * 1023) xml2 = "aaa%s" % ('a' * 1025) parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 parser.buffer_size = 2048 self.assertEqual(parser.buffer_size, 2048) self.n=0 parser.Parse(xml1, 0) parser.buffer_size //= 2 self.assertEqual(parser.buffer_size, 1024) parser.Parse(xml2, 1) self.assertEqual(self.n, 4) class MalformedInputText(unittest.TestCase): def test1(self): xml = "\0\r\n" parser = expat.ParserCreate() try: parser.Parse(xml, True) self.fail() except expat.ExpatError as e: self.assertEqual(str(e), 'unclosed token: line 2, column 0') def test2(self): xml = "\r\n" parser = expat.ParserCreate() try: parser.Parse(xml, True) self.fail() except expat.ExpatError as e: self.assertEqual(str(e), 'XML declaration not well-formed: line 1, column 14') class ForeignDTDTests(unittest.TestCase): """ Tests for the UseForeignDTD method of expat parser objects. """ def test_use_foreign_dtd(self): """ If UseForeignDTD is passed True and a document without an external entity reference is parsed, ExternalEntityRefHandler is first called with None for the public and system ids. """ handler_call_args = [] def resolve_entity(context, base, system_id, public_id): handler_call_args.append((public_id, system_id)) return 1 parser = expat.ParserCreate() parser.UseForeignDTD(True) parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity parser.Parse("") self.assertEqual(handler_call_args, [(None, None)]) # test UseForeignDTD() is equal to UseForeignDTD(True) handler_call_args[:] = [] parser = expat.ParserCreate() parser.UseForeignDTD() parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity parser.Parse("") self.assertEqual(handler_call_args, [(None, None)]) def test_ignore_use_foreign_dtd(self): """ If UseForeignDTD is passed True and a document with an external entity reference is parsed, ExternalEntityRefHandler is called with the public and system ids from the document. """ handler_call_args = [] def resolve_entity(context, base, system_id, public_id): handler_call_args.append((public_id, system_id)) return 1 parser = expat.ParserCreate() parser.UseForeignDTD(True) parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity parser.Parse( "") self.assertEqual(handler_call_args, [("bar", "baz")]) def test_main(): run_unittest(SetAttributeTest, ParseTest, NamespaceSeparatorTest, InterningTest, BufferTextTest, HandlerExceptionTest, PositionTest, sf1296433Test, ChardataBufferTest, MalformedInputText, ForeignDTDTests) if __name__ == "__main__": test_main()