diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2013-11-23 17:52:05 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2013-11-23 17:52:05 (GMT) |
commit | 95401c5f6b9f07b094924559177c9b30a1c38998 (patch) | |
tree | 3029ea3bbffc0c53c64275a2e587bbf696a740cb /Lib/test/test_htmlparser.py | |
parent | e7f87e12626d6ae3b9ed8cae8904a6afad580ffc (diff) | |
download | cpython-95401c5f6b9f07b094924559177c9b30a1c38998.zip cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.gz cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.bz2 |
#13633: Added a new convert_charrefs keyword arg to HTMLParser that, when True, automatically converts all character references.
Diffstat (limited to 'Lib/test/test_htmlparser.py')
-rw-r--r-- | Lib/test/test_htmlparser.py | 70 |
1 file changed, 62 insertions, 8 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 509b3cd..1a480c8 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector): self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): + + def get_events(self): + return self.events + + def handle_charref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + def handle_entityref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase): parser.close() events = parser.get_events() if events != expected_events: - self.fail("received events did not match expected events\n" - "Expected:\n" + pprint.pformat(expected_events) + + self.fail("received events did not match expected events" + + "\nSource:\n" + repr(source) + + "\nExpected:\n" + pprint.pformat(expected_events) + "\nReceived:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): - self._run_check(source, events, EventCollectorExtra()) + self._run_check(source, events, + EventCollectorExtra(convert_charrefs=False)) def _parse_error(self, source): def parse(source=source): @@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase): def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector(strict=True, convert_charrefs=False) def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -335,7 +349,7 @@ text self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)], - collector=Collector()) + collector=Collector(convert_charrefs=False)) def test_comments(self): html = ("<!-- I'm a valid comment -->" @@ -363,14 +377,54 @@ text ('comment', '[if lte 
IE 7]>pretty?<![endif]')] self._run_check(html, expected) + def test_convert_charrefs(self): + collector = lambda: EventCollectorCharrefs(convert_charrefs=True) + self.assertTrue(collector().convert_charrefs) + charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'] + # check charrefs in the middle of the text/attributes + expected = [('starttag', 'a', [('href', 'foo"zar')]), + ('data', 'a"z'), ('endtag', 'a')] + for charref in charrefs: + self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref), + expected, collector=collector()) + # check charrefs at the beginning/end of the text/attributes + expected = [('data', '"'), + ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), + ('data', '"'), ('endtag', 'a'), ('data', '"')] + for charref in charrefs: + self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">' + '{0}</a>{0}'.format(charref), + expected, collector=collector()) + # check charrefs in <script>/<style> elements + for charref in charrefs: + text = 'X'.join([charref]*3) + expected = [('data', '"'), + ('starttag', 'script', []), ('data', text), + ('endtag', 'script'), ('data', '"'), + ('starttag', 'style', []), ('data', text), + ('endtag', 'style'), ('data', '"')] + self._run_check('{1}<script>{0}</script>{1}' + '<style>{0}</style>{1}'.format(text, charref), + expected, collector=collector()) + # check truncated charrefs at the end of the file + html = '&quo &# &#x' + for x in range(1, len(html)): + self._run_check(html[:x], [('data', html[:x])], + collector=collector()) + # check a string with no charrefs + self._run_check('no charrefs here', [('data', 'no charrefs here')], + collector=collector()) + class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self): - return EventCollector() + return EventCollector(convert_charrefs=False) def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning): + EventCollector() # convert_charrefs not passed explicitly + with self.assertWarns(DeprecationWarning): EventCollector(strict=True) 
with self.assertWarns(DeprecationWarning): EventCollector(strict=False) @@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase): def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector(strict=True, convert_charrefs=False) def test_attr_syntax(self): output = [ @@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase): class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self): - return EventCollector() + return EventCollector(convert_charrefs=False) def test_attr_funky_names2(self): self._run_check( |