path: root/Lib
diff options
author Zackery Spytz <> 2020-08-09 10:50:53 (GMT)
committer GitHub <> 2020-08-09 10:50:53 (GMT)
commit e28b8c93878072dc02b116108ef5443084290d47 (patch)
tree 5a5a398cdc0dbb9f8c78fe9d37ed15b7e3ce89ce /Lib
parent 67acf74c4eaf64a860cc1bcda6efe6e9cb01f89b (diff)
bpo-35018: Sax parser should provide user access to lexical handlers (GH-20958)
Co-Authored-By: Jonathan Gossage <>
Diffstat (limited to 'Lib')
2 files changed, 200 insertions, 2 deletions
diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py
index cfc674b..801143f 100644
--- a/Lib/test/test_sax.py
+++ b/Lib/test/test_sax.py
@@ -13,7 +13,8 @@ except SAXReaderNotAvailable:
from xml.sax.saxutils import XMLGenerator, escape, unescape, quoteattr, \
XMLFilterBase, prepare_input_source
from xml.sax.expatreader import create_parser
-from xml.sax.handler import feature_namespaces, feature_external_ges
+from xml.sax.handler import (feature_namespaces, feature_external_ges,
+ LexicalHandler)
from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl
from io import BytesIO, StringIO
import codecs
@@ -1356,6 +1357,155 @@ class XmlReaderTest(XmlTestBase):
self.assertEqual(attrs.getQNameByName((ns_uri, "attr")), "ns:attr")
+class LexicalHandlerTest(unittest.TestCase):
+ def setUp(self):
+ self.parser = None
+ self.specified_version = '1.0'
+ self.specified_encoding = 'UTF-8'
+ self.specified_doctype = 'wish'
+ self.specified_entity_names = ('nbsp', 'source', 'target')
+ self.specified_comment = ('Comment in a DTD',
+ 'Really! You think so?')
+ self.test_data = StringIO()
+ self.test_data.write('<?xml version="{}" encoding="{}"?>\n'.
+ format(self.specified_version,
+ self.specified_encoding))
+ self.test_data.write('<!DOCTYPE {} [\n'.
+ format(self.specified_doctype))
+ self.test_data.write('<!-- {} -->\n'.
+ format(self.specified_comment[0]))
+ self.test_data.write('<!ELEMENT {} (to,from,heading,body,footer)>\n'.
+ format(self.specified_doctype))
+ self.test_data.write('<!ELEMENT to (#PCDATA)>\n')
+ self.test_data.write('<!ELEMENT from (#PCDATA)>\n')
+ self.test_data.write('<!ELEMENT heading (#PCDATA)>\n')
+ self.test_data.write('<!ELEMENT body (#PCDATA)>\n')
+ self.test_data.write('<!ELEMENT footer (#PCDATA)>\n')
+ self.test_data.write('<!ENTITY {} "&#xA0;">\n'.
+ format(self.specified_entity_names[0]))
+ self.test_data.write('<!ENTITY {} "Written by: Alexander.">\n'.
+ format(self.specified_entity_names[1]))
+ self.test_data.write('<!ENTITY {} "Hope it gets to: Aristotle.">\n'.
+ format(self.specified_entity_names[2]))
+ self.test_data.write(']>\n')
+ self.test_data.write('<{}>'.format(self.specified_doctype))
+ self.test_data.write('<to>Aristotle</to>\n')
+ self.test_data.write('<from>Alexander</from>\n')
+ self.test_data.write('<heading>Supplication</heading>\n')
+ self.test_data.write('<body>Teach me patience!</body>\n')
+ self.test_data.write('<footer>&{};&{};&{};</footer>\n'.
+ format(self.specified_entity_names[1],
+ self.specified_entity_names[0],
+ self.specified_entity_names[2]))
+ self.test_data.write('<!-- {} -->\n'.format(self.specified_comment[1]))
+ self.test_data.write('</{}>\n'.format(self.specified_doctype))
+ # Data received from handlers - to be validated
+ self.version = None
+ self.encoding = None
+ self.standalone = None
+ self.doctype = None
+ self.publicID = None
+ self.systemID = None
+ self.end_of_dtd = False
+ self.comments = []
+ def test_handlers(self):
+ class TestLexicalHandler(LexicalHandler):
+ def __init__(self, test_harness, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.test_harness = test_harness
+ def startDTD(self, doctype, publicID, systemID):
+ self.test_harness.doctype = doctype
+ self.test_harness.publicID = publicID
+ self.test_harness.systemID = systemID
+ def endDTD(self):
+ self.test_harness.end_of_dtd = True
+ def comment(self, text):
+ self.test_harness.comments.append(text)
+ self.parser = create_parser()
+ self.parser.setContentHandler(ContentHandler())
+ self.parser.setProperty(
+ '',
+ TestLexicalHandler(self))
+ source = InputSource()
+ source.setCharacterStream(self.test_data)
+ self.parser.parse(source)
+ self.assertEqual(self.doctype, self.specified_doctype)
+ self.assertIsNone(self.publicID)
+ self.assertIsNone(self.systemID)
+ self.assertTrue(self.end_of_dtd)
+ self.assertEqual(len(self.comments),
+ len(self.specified_comment))
+ self.assertEqual(f' {self.specified_comment[0]} ', self.comments[0])
+class CDATAHandlerTest(unittest.TestCase):
+ def setUp(self):
+ self.parser = None
+ self.specified_chars = []
+ self.specified_chars.append(('Parseable character data', False))
+ self.specified_chars.append(('<> &% - assorted other XML junk.', True))
+ self.char_index = 0 # Used to index specified results within handlers
+ self.test_data = StringIO()
+ self.test_data.write('<root_doc>\n')
+ self.test_data.write('<some_pcdata>\n')
+ self.test_data.write(f'{self.specified_chars[0][0]}\n')
+ self.test_data.write('</some_pcdata>\n')
+ self.test_data.write('<some_cdata>\n')
+ self.test_data.write(f'<![CDATA[{self.specified_chars[1][0]}]]>\n')
+ self.test_data.write('</some_cdata>\n')
+ self.test_data.write('</root_doc>\n')
+ # Data received from handlers - to be validated
+ self.chardata = []
+ self.in_cdata = False
+ def test_handlers(self):
+ class TestLexicalHandler(LexicalHandler):
+ def __init__(self, test_harness, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.test_harness = test_harness
+ def startCDATA(self):
+ self.test_harness.in_cdata = True
+ def endCDATA(self):
+ self.test_harness.in_cdata = False
+ class TestCharHandler(ContentHandler):
+ def __init__(self, test_harness, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.test_harness = test_harness
+ def characters(self, content):
+ if content != '\n':
+ h = self.test_harness
+ t = h.specified_chars[h.char_index]
+ h.assertEqual(t[0], content)
+ h.assertEqual(t[1], h.in_cdata)
+ h.char_index += 1
+ self.parser = create_parser()
+ self.parser.setContentHandler(TestCharHandler(self))
+ self.parser.setProperty(
+ '',
+ TestLexicalHandler(self))
+ source = InputSource()
+ source.setCharacterStream(self.test_data)
+ self.parser.parse(source)
+ self.assertFalse(self.in_cdata)
+ self.assertEqual(self.char_index, 2)
def test_main():
@@ -1368,7 +1518,10 @@ def test_main():
- XmlReaderTest)
+ XmlReaderTest,
+ LexicalHandlerTest,
+ CDATAHandlerTest)
if __name__ == "__main__":
diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py
index 481733d..e8d417e 100644
--- a/Lib/xml/sax/handler.py
+++ b/Lib/xml/sax/handler.py
@@ -340,3 +340,48 @@ all_properties = [property_lexical_handler,
class LexicalHandler:
    """Optional SAX2 handler for lexical events.

    This handler is used to obtain lexical information about an XML
    document, that is, information about how the document was encoded
    (as opposed to what it contains, which is reported to the
    ContentHandler), such as comments and CDATA marked section
    boundaries.

    To set the LexicalHandler of an XMLReader, use the setProperty
    method with the property identifier
    'http://xml.org/sax/properties/lexical-handler'.

    Every method here is a no-op; subclasses override the events they
    are interested in."""

    def comment(self, content):
        """Reports a comment anywhere in the document (including the
        DTD and outside the document element).

        content is a string that holds the contents of the comment."""

    def startDTD(self, name, public_id, system_id):
        """Report the start of the DTD declarations, if the document
        has an associated DTD.

        A startEntity event will be reported before declaration events
        from the external DTD subset are reported, and this can be
        used to infer from which subset DTD declarations derive.

        name is the name of the document element type, public_id the
        public identifier of the DTD (or None if none was supplied)
        and system_id the system identifier of the external subset (or
        None if none was supplied)."""

    def endDTD(self):
        """Signals the end of DTD declarations."""

    def startCDATA(self):
        """Reports the beginning of a CDATA marked section.

        The contents of the CDATA marked section will be reported
        through the characters event."""

    def endCDATA(self):
        """Reports the end of a CDATA marked section."""