diff options
Diffstat (limited to 'Lib/test/test_htmlparser.py')
-rw-r--r--[-rwxr-xr-x] | Lib/test/test_htmlparser.py | 66 |
1 files changed, 60 insertions, 6 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 661d41d..637ab01 100755..100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -8,10 +8,10 @@ from test import support class EventCollector(html.parser.HTMLParser): - def __init__(self): + def __init__(self, *args, **kw): self.events = [] self.append = self.events.append - html.parser.HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self, *args, **kw) def get_events(self): # Normalize the list of events so that buffer artefacts don't @@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector): class TestCaseBase(unittest.TestCase): - def _run_check(self, source, expected_events, collector=EventCollector): - parser = collector() + def _run_check(self, source, expected_events, collector=None): + if collector is None: + collector = EventCollector() + parser = collector for s in source: parser.feed(s) parser.close() @@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase): "\nReceived:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): - self._run_check(source, events, EventCollectorExtra) + self._run_check(source, events, EventCollectorExtra()) def _parse_error(self, source): def parse(source=source): @@ -215,6 +217,23 @@ DOCTYPE html [ ("starttag", "a", [("href", "mailto:xyz@example.com")]), ]) + def test_attr_nonascii(self): + # see issue 7311 + self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [ + ("starttag", "img", [("src", "/foo/bar.png"), + ("alt", "\u4e2d\u6587")]), + ]) + self._run_check("<a title='\u30c6\u30b9\u30c8' " + "href='\u30c6\u30b9\u30c8.html'>", [ + ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")]), + ]) + self._run_check('<a title="\u30c6\u30b9\u30c8" ' + 'href="\u30c6\u30b9\u30c8.html">', [ + ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")]), + ]) + def test_attr_entity_replacement(self): self._run_check("""<a b='&><"''>""", [ ("starttag", "a", [("b", "&><\"'")]), @@ -319,6 +338,41 @@ DOCTYPE html [ self._run_check("<html foo='€&aa&unsupported;'>", [ ("starttag", "html", [("foo", "\u20AC&aa&unsupported;")]) ]) + + +class HTMLParserTolerantTestCase(TestCaseBase): + + def setUp(self): + self.collector = EventCollector(strict=False) + + def test_tolerant_parsing(self): + self._run_check('<html <html>te>>xt&a<<bc</a></html>\n' + '<img src="URL><//img></html</html>', [ + ('data', '<html '), + ('starttag', 'html', []), + ('data', 'te>>xt'), + ('entityref', 'a'), + ('data', '<<bc'), + ('endtag', 'a'), + ('endtag', 'html'), + ('data', '\n<img src="URL><//img></html'), + ('endtag', 'html')], + collector = self.collector) + + def test_comma_between_attributes(self): + self._run_check('<form action="/xxx.php?a=1&b=2&", ' + 'method="post">', [ + ('starttag', 'form', + [('action', '/xxx.php?a=1&b=2&'), + ('method', 'post')])], + collector = self.collector) + + def test_weird_chars_in_unquoted_attribute_values(self): + self._run_check('<form action=bogus|&#()value>', [ + ('starttag', 'form', + [('action', 'bogus|&#()value')])], + collector = self.collector) + def test_unescape_function(self): p = html.parser.HTMLParser() self.assertEqual(p.unescape('&#bad;'),'&#bad;') @@ -326,7 +380,7 @@ DOCTYPE html [ def test_main(): - support.run_unittest(HTMLParserTestCase) + support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase) if __name__ == "__main__": |