diff options
author | Zackery Spytz <zspytz@gmail.com> | 2020-08-09 10:50:53 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-09 10:50:53 (GMT) |
commit | e28b8c93878072dc02b116108ef5443084290d47 (patch) | |
tree | 5a5a398cdc0dbb9f8c78fe9d37ed15b7e3ce89ce /Lib | |
parent | 67acf74c4eaf64a860cc1bcda6efe6e9cb01f89b (diff) | |
download | cpython-e28b8c93878072dc02b116108ef5443084290d47.zip cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.gz cpython-e28b8c93878072dc02b116108ef5443084290d47.tar.bz2 |
bpo-35018: Sax parser should provide user access to lexical handlers (GH-20958)
Co-Authored-By: Jonathan Gossage <jgossage@gmail.com>
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_sax.py | 157 | ||||
-rw-r--r-- | Lib/xml/sax/handler.py | 45 |
2 files changed, 200 insertions, 2 deletions
diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py index cfc674b..801143f 100644 --- a/Lib/test/test_sax.py +++ b/Lib/test/test_sax.py @@ -13,7 +13,8 @@ except SAXReaderNotAvailable: from xml.sax.saxutils import XMLGenerator, escape, unescape, quoteattr, \ XMLFilterBase, prepare_input_source from xml.sax.expatreader import create_parser -from xml.sax.handler import feature_namespaces, feature_external_ges +from xml.sax.handler import (feature_namespaces, feature_external_ges, + LexicalHandler) from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl from io import BytesIO, StringIO import codecs @@ -1356,6 +1357,155 @@ class XmlReaderTest(XmlTestBase): self.assertEqual(attrs.getQNameByName((ns_uri, "attr")), "ns:attr") +class LexicalHandlerTest(unittest.TestCase): + def setUp(self): + self.parser = None + + self.specified_version = '1.0' + self.specified_encoding = 'UTF-8' + self.specified_doctype = 'wish' + self.specified_entity_names = ('nbsp', 'source', 'target') + self.specified_comment = ('Comment in a DTD', + 'Really! You think so?') + self.test_data = StringIO() + self.test_data.write('<?xml version="{}" encoding="{}"?>\n'. + format(self.specified_version, + self.specified_encoding)) + self.test_data.write('<!DOCTYPE {} [\n'. + format(self.specified_doctype)) + self.test_data.write('<!-- {} -->\n'. + format(self.specified_comment[0])) + self.test_data.write('<!ELEMENT {} (to,from,heading,body,footer)>\n'. + format(self.specified_doctype)) + self.test_data.write('<!ELEMENT to (#PCDATA)>\n') + self.test_data.write('<!ELEMENT from (#PCDATA)>\n') + self.test_data.write('<!ELEMENT heading (#PCDATA)>\n') + self.test_data.write('<!ELEMENT body (#PCDATA)>\n') + self.test_data.write('<!ELEMENT footer (#PCDATA)>\n') + self.test_data.write('<!ENTITY {} " ">\n'. + format(self.specified_entity_names[0])) + self.test_data.write('<!ENTITY {} "Written by: Alexander.">\n'. 
+ format(self.specified_entity_names[1])) + self.test_data.write('<!ENTITY {} "Hope it gets to: Aristotle.">\n'. + format(self.specified_entity_names[2])) + self.test_data.write(']>\n') + self.test_data.write('<{}>'.format(self.specified_doctype)) + self.test_data.write('<to>Aristotle</to>\n') + self.test_data.write('<from>Alexander</from>\n') + self.test_data.write('<heading>Supplication</heading>\n') + self.test_data.write('<body>Teach me patience!</body>\n') + self.test_data.write('<footer>&{};&{};&{};</footer>\n'. + format(self.specified_entity_names[1], + self.specified_entity_names[0], + self.specified_entity_names[2])) + self.test_data.write('<!-- {} -->\n'.format(self.specified_comment[1])) + self.test_data.write('</{}>\n'.format(self.specified_doctype)) + self.test_data.seek(0) + + # Data received from handlers - to be validated + self.version = None + self.encoding = None + self.standalone = None + self.doctype = None + self.publicID = None + self.systemID = None + self.end_of_dtd = False + self.comments = [] + + def test_handlers(self): + class TestLexicalHandler(LexicalHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def startDTD(self, doctype, publicID, systemID): + self.test_harness.doctype = doctype + self.test_harness.publicID = publicID + self.test_harness.systemID = systemID + + def endDTD(self): + self.test_harness.end_of_dtd = True + + def comment(self, text): + self.test_harness.comments.append(text) + + self.parser = create_parser() + self.parser.setContentHandler(ContentHandler()) + self.parser.setProperty( + 'http://xml.org/sax/properties/lexical-handler', + TestLexicalHandler(self)) + source = InputSource() + source.setCharacterStream(self.test_data) + self.parser.parse(source) + self.assertEqual(self.doctype, self.specified_doctype) + self.assertIsNone(self.publicID) + self.assertIsNone(self.systemID) + self.assertTrue(self.end_of_dtd) + 
self.assertEqual(len(self.comments), + len(self.specified_comment)) + self.assertEqual(f' {self.specified_comment[0]} ', self.comments[0]) + + +class CDATAHandlerTest(unittest.TestCase): + def setUp(self): + self.parser = None + self.specified_chars = [] + self.specified_chars.append(('Parseable character data', False)) + self.specified_chars.append(('<> &% - assorted other XML junk.', True)) + self.char_index = 0 # Used to index specified results within handlers + self.test_data = StringIO() + self.test_data.write('<root_doc>\n') + self.test_data.write('<some_pcdata>\n') + self.test_data.write(f'{self.specified_chars[0][0]}\n') + self.test_data.write('</some_pcdata>\n') + self.test_data.write('<some_cdata>\n') + self.test_data.write(f'<![CDATA[{self.specified_chars[1][0]}]]>\n') + self.test_data.write('</some_cdata>\n') + self.test_data.write('</root_doc>\n') + self.test_data.seek(0) + + # Data received from handlers - to be validated + self.chardata = [] + self.in_cdata = False + + def test_handlers(self): + class TestLexicalHandler(LexicalHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def startCDATA(self): + self.test_harness.in_cdata = True + + def endCDATA(self): + self.test_harness.in_cdata = False + + class TestCharHandler(ContentHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def characters(self, content): + if content != '\n': + h = self.test_harness + t = h.specified_chars[h.char_index] + h.assertEqual(t[0], content) + h.assertEqual(t[1], h.in_cdata) + h.char_index += 1 + + self.parser = create_parser() + self.parser.setContentHandler(TestCharHandler(self)) + self.parser.setProperty( + 'http://xml.org/sax/properties/lexical-handler', + TestLexicalHandler(self)) + source = InputSource() + source.setCharacterStream(self.test_data) + self.parser.parse(source) + + 
self.assertFalse(self.in_cdata) + self.assertEqual(self.char_index, 2) + + def test_main(): run_unittest(MakeParserTest, ParseTest, @@ -1368,7 +1518,10 @@ def test_main(): StreamReaderWriterXmlgenTest, ExpatReaderTest, ErrorReportingTest, - XmlReaderTest) + XmlReaderTest, + LexicalHandlerTest, + CDATAHandlerTest) + if __name__ == "__main__": test_main() diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py index 481733d..e8d417e 100644 --- a/Lib/xml/sax/handler.py +++ b/Lib/xml/sax/handler.py @@ -340,3 +340,48 @@ all_properties = [property_lexical_handler, property_xml_string, property_encoding, property_interning_dict] + + +class LexicalHandler: + """Optional SAX2 handler for lexical events. + + This handler is used to obtain lexical information about an XML + document, that is, information about how the document was encoded + (as opposed to what it contains, which is reported to the + ContentHandler), such as comments and CDATA marked section + boundaries. + + To set the LexicalHandler of an XMLReader, use the setProperty + method with the property identifier + 'http://xml.org/sax/properties/lexical-handler'.""" + + def comment(self, content): + """Reports a comment anywhere in the document (including the + DTD and outside the document element). + + content is a string that holds the contents of the comment.""" + + def startDTD(self, name, public_id, system_id): + """Report the start of the DTD declarations, if the document + has an associated DTD. + + A startEntity event will be reported before declaration events + from the external DTD subset are reported, and this can be + used to infer from which subset DTD declarations derive. 
+ + name is the name of the document element type, public_id the + public identifier of the DTD (or None if none were supplied) + and system_id the system identifier of the external subset (or + None if none were supplied).""" + + def endDTD(self): + """Signals the end of DTD declarations.""" + + def startCDATA(self): + """Reports the beginning of a CDATA marked section. + + The contents of the CDATA marked section will be reported + through the characters event.""" + + def endCDATA(self): + """Reports the end of a CDATA marked section.""" |