#1486713: Add a tolerant mode to HTMLParser.

The motivation for adding this option is that the functionality it provides used to be provided by sgmllib in Python2, and was used by, for example, BeautifulSoup. Without this option, the Python3 version of BeautifulSoup and the many programs that use it are crippled. The original patch was by 'kxroberto'. I modified it heavily but kept his heuristics and test. I also added additional heuristics to fix #975556, #1046092, and part of #6191. This patch should be completely backward compatible: the behavior with the default strict=True is unchanged.
author: R. David Murray <rdmurray@bitdance.com> 2010-12-03 04:06:39 (GMT)
committer: R. David Murray <rdmurray@bitdance.com> 2010-12-03 04:06:39 (GMT)
commit: b579dba1195df97f87ba868a5987f18fb7509bff (patch)
tree: d1ff2cf38f061ee0bba08459167e33daa7a4ad79 /Lib/test/test_htmlparser.py
parent: 79cdb661f5a6cf8bba07aa50f4451f6c409bb067 (diff)
download: cpython-b579dba1195df97f87ba868a5987f18fb7509bff.zip
cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.gz
cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.bz2
1 files changed, 42 insertions, 6 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e982218..beaf6b6 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -8,10 +8,10 @@ from test import support
 
 class EventCollector(html.parser.HTMLParser):
 
-    def __init__(self):
+    def __init__(self, *args, **kw):
         self.events = []
         self.append = self.events.append
-        html.parser.HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self, *args, **kw)
 
     def get_events(self):
         # Normalize the list of events so that buffer artefacts don't
@@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector):
 
 class TestCaseBase(unittest.TestCase):
 
-    def _run_check(self, source, expected_events, collector=EventCollector):
-        parser = collector()
+    def _run_check(self, source, expected_events, collector=None):
+        if collector is None:
+            collector = EventCollector()
+        parser = collector
         for s in source:
             parser.feed(s)
         parser.close()
@@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase):
                       "\nReceived:\n" + pprint.pformat(events))
 
     def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra)
+        self._run_check(source, events, EventCollectorExtra())
 
     def _parse_error(self, source):
         def parse(source=source):
@@ -321,8 +323,42 @@ DOCTYPE html [
                 ])
 
 
+class HTMLParserTolerantTestCase(TestCaseBase):
+
+    def setUp(self):
+        self.collector = EventCollector(strict=False)
+
+    def test_tolerant_parsing(self):
+        self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
+                        '<img src="URL><//img></html</html>', [
+                             ('data', '<html '),
+                             ('starttag', 'html', []),
+                             ('data', 'te>>xt'),
+                             ('entityref', 'a'),
+                             ('data', '<<bc'),
+                             ('endtag', 'a'),
+                             ('endtag', 'html'),
+                             ('data', '\n<img src="URL><//img></html'),
+                             ('endtag', 'html')],
+                        collector = self.collector)
+
+    def test_comma_between_attributes(self):
+        self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
+                        'method="post">', [
+                            ('starttag', 'form',
+                                [('action', '/xxx.php?a=1&b=2&amp'),
+                                 ('method', 'post')])],
+                        collector = self.collector)
+
+    def test_weird_chars_in_unquoted_attribute_values(self):
+        self._run_check('<form action=bogus|&#()value>', [
+                            ('starttag', 'form',
+                                [('action', 'bogus|&#()value')])],
+                        collector = self.collector)
+
+
 def test_main():
-    support.run_unittest(HTMLParserTestCase)
+    support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase)
 
 
 if __name__ == "__main__":
author	R. David Murray <rdmurray@bitdance.com>	2010-12-03 04:06:39 (GMT)
committer	R. David Murray <rdmurray@bitdance.com>	2010-12-03 04:06:39 (GMT)
commit	b579dba1195df97f87ba868a5987f18fb7509bff (patch)
tree	d1ff2cf38f061ee0bba08459167e33daa7a4ad79 /Lib/test/test_htmlparser.py
parent	79cdb661f5a6cf8bba07aa50f4451f6c409bb067 (diff)
download	cpython-b579dba1195df97f87ba868a5987f18fb7509bff.zip cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.gz cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.bz2