#1486713: Add a tolerant mode to HTMLParser.

The motivation for adding this option is that the functionality it provides used to be provided by sgmllib in Python2, and was used by, for example, BeautifulSoup. Without this option, the Python3 version of BeautifulSoup and the many programs that use it are crippled. The original patch was by 'kxroberto'. I modified it heavily but kept his heuristics and test. I also added additional heuristics to fix #975556, #1046092, and part of #6191. This patch should be completely backward compatible: the behavior with the default strict=True is unchanged.
author: R. David Murray <rdmurray@bitdance.com> 2010-12-03 04:06:39 (GMT)
committer: R. David Murray <rdmurray@bitdance.com> 2010-12-03 04:06:39 (GMT)
commit: b579dba1195df97f87ba868a5987f18fb7509bff (patch)
tree: d1ff2cf38f061ee0bba08459167e33daa7a4ad79
parent: 79cdb661f5a6cf8bba07aa50f4451f6c409bb067 (diff)
download: cpython-b579dba1195df97f87ba868a5987f18fb7509bff.zip
cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.gz
cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.bz2
4 files changed, 139 insertions, 24 deletions
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 2bc6555..743d183 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -12,9 +12,13 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
-.. class:: HTMLParser()
+.. class:: HTMLParser(strict=True)
 
-   The :class:`HTMLParser` class is instantiated without arguments.
+   Create a parser instance.  If *strict* is ``True`` (the default), invalid
+   html results in :exc:`~html.parser.HTMLParseError` exceptions [#]_.  If
+   *strict* is ``False``, the parser uses heuristics to make a best guess at
+   the intention of any invalid html it encounters, similar to the way most
+   browsers do.
 
    An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags
    begin and end.  The :class:`HTMLParser` class is meant to be overridden by the
@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the
    Encountered a html end tag
 
 
+.. rubric:: Footnotes
+
+.. [#] For backward compatibility reasons *strict* mode does not throw
+       errors for all non-compliant HTML.  That is, some invalid HTML
+       is tolerated even in *strict* mode.
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index c2c7f6b..8d275ab 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -24,10 +24,14 @@ starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
+# Note, the strict one of this pair isn't really strict, but we can't
+# make it correctly strict without breaking backward compatibility.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
-
+attrfind_tolerant = re.compile(
+    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
@@ -42,6 +46,21 @@ locatestarttagend = re.compile(r"""
    )*
   \s*                                # trailing whitespace
 """, re.VERBOSE)
+locatestarttagend_tolerant = re.compile(r"""
+  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
+  (?:\s*                             # optional whitespace before attribute name
+    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
+      (?:\s*=\s*                     # value indicator
+        (?:'[^']*'                   # LITA-enclosed value
+          |\"[^\"]*\"                # LIT-enclosed value
+          |[^'\">\s]+                # bare value
+         )
+         (?:\s*,)*                   # possibly followed by a comma
+       )?
+     )
+   )*
+  \s*                                # trailing whitespace
+""", re.VERBOSE)
 endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
+    def __init__(self, strict=True):
+        """Initialize and reset this instance.
 
-    def __init__(self):
-        """Initialize and reset this instance."""
+        If strict is set to True (the default), errors are raised when invalid
+        HTML is encountered.  If set to False, an attempt is instead made to
+        continue parsing, making "best guesses" about the intended meaning, in
+        a fashion similar to what browsers typically do.
+        """
+        self.strict = strict
         self.reset()
 
     def reset(self):
@@ -160,9 +185,18 @@ class HTMLParser(_markupbase.ParserBase):
                 else:
                     break
                 if k < 0:
-                    if end:
+                    if not end:
+                        break
+                    if self.strict:
                         self.error("EOF in middle of construct")
-                    break
+                    k = rawdata.find('>', i + 1)
+                    if k < 0:
+                        k = rawdata.find('<', i + 1)
+                        if k < 0:
+                            k = i + 1
+                    else:
+                        k += 1
+                    self.handle_data(rawdata[i:k])
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -193,7 +227,12 @@ class HTMLParser(_markupbase.ParserBase):
                 if match:
                     # match.group() will contain at least 2 chars
                     if end and match.group() == rawdata[i:]:
-                        self.error("EOF in middle of entity or char ref")
+                        if self.strict:
+                            self.error("EOF in middle of entity or char ref")
+                        else:
+                            if k <= i:
+                                k = n
+                            i = self.updatepos(i, i + 1)
                     # incomplete
                     break
                 elif (i + 1) < n:
@@ -240,7 +279,10 @@ class HTMLParser(_markupbase.ParserBase):
         self.lasttag = tag = rawdata[i+1:k].lower()
 
         while k < endpos:
-            m = attrfind.match(rawdata, k)
+            if self.strict:
+                m = attrfind.match(rawdata, k)
+            else:
+                m = attrfind_tolerant.search(rawdata, k)
             if not m:
                 break
             attrname, rest, attrvalue = m.group(1, 2, 3)
@@ -262,8 +304,11 @@ class HTMLParser(_markupbase.ParserBase):
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            if self.strict:
+                self.error("junk characters in start tag: %r"
+                           % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
@@ -277,7 +322,10 @@ class HTMLParser(_markupbase.ParserBase):
     # or -1 if incomplete.
     def check_for_whole_start_tag(self, i):
         rawdata = self.rawdata
-        m = locatestarttagend.match(rawdata, i)
+        if self.strict:
+            m = locatestarttagend.match(rawdata, i)
+        else:
+            m = locatestarttagend_tolerant.match(rawdata, i)
         if m:
             j = m.end()
             next = rawdata[j:j+1]
@@ -290,8 +338,13 @@ class HTMLParser(_markupbase.ParserBase):
                     # buffer boundary
                     return -1
                 # else bogus input
-                self.updatepos(i, j + 1)
-                self.error("malformed empty start tag")
+                if self.strict:
+                    self.updatepos(i, j + 1)
+                    self.error("malformed empty start tag")
+                if j > i:
+                    return j
+                else:
+                    return i + 1
             if next == "":
                 # end of input
                 return -1
@@ -300,8 +353,13 @@ class HTMLParser(_markupbase.ParserBase):
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if self.strict:
+                self.updatepos(i, j)
+                self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
         raise AssertionError("we should not get here!")
 
     # Internal -- parse endtag, return end or -1 if incomplete
@@ -314,7 +372,15 @@ class HTMLParser(_markupbase.ParserBase):
         j = match.end()
         match = endtagfind.match(rawdata, i) # </ + tag + >
         if not match:
-            self.error("bad end tag: %r" % (rawdata[i:j],))
+            if self.strict:
+                self.error("bad end tag: %r" % (rawdata[i:j],))
+            k = rawdata.find('<', i + 1, j)
+            if k > i:
+                j = k
+            if j <= i:
+                j = i + 1
+            self.handle_data(rawdata[i:j])
+            return j
         tag = match.group(1)
         self.handle_endtag(tag.lower())
         self.clear_cdata_mode()
@@ -358,7 +424,8 @@ class HTMLParser(_markupbase.ParserBase):
         pass
 
     def unknown_decl(self, data):
-        self.error("unknown declaration: %r" % (data,))
+        if self.strict:
+            self.error("unknown declaration: %r" % (data,))
 
     # Internal -- helper to remove special character quoting
     entitydefs = None
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e982218..beaf6b6 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -8,10 +8,10 @@ from test import support
 
 class EventCollector(html.parser.HTMLParser):
 
-    def __init__(self):
+    def __init__(self, *args, **kw):
         self.events = []
         self.append = self.events.append
-        html.parser.HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self, *args, **kw)
 
     def get_events(self):
         # Normalize the list of events so that buffer artefacts don't
@@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector):
 
 class TestCaseBase(unittest.TestCase):
 
-    def _run_check(self, source, expected_events, collector=EventCollector):
-        parser = collector()
+    def _run_check(self, source, expected_events, collector=None):
+        if collector is None:
+            collector = EventCollector()
+        parser = collector
         for s in source:
             parser.feed(s)
         parser.close()
@@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase):
                       "\nReceived:\n" + pprint.pformat(events))
 
     def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra)
+        self._run_check(source, events, EventCollectorExtra())
 
     def _parse_error(self, source):
         def parse(source=source):
@@ -321,8 +323,42 @@ DOCTYPE html [
                 ])
 
 
+class HTMLParserTolerantTestCase(TestCaseBase):
+
+    def setUp(self):
+        self.collector = EventCollector(strict=False)
+
+    def test_tolerant_parsing(self):
+        self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
+                        '<img src="URL><//img></html</html>', [
+                             ('data', '<html '),
+                             ('starttag', 'html', []),
+                             ('data', 'te>>xt'),
+                             ('entityref', 'a'),
+                             ('data', '<<bc'),
+                             ('endtag', 'a'),
+                             ('endtag', 'html'),
+                             ('data', '\n<img src="URL><//img></html'),
+                             ('endtag', 'html')],
+                        collector = self.collector)
+
+    def test_comma_between_attributes(self):
+        self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
+                        'method="post">', [
+                            ('starttag', 'form',
+                                [('action', '/xxx.php?a=1&b=2&amp'),
+                                 ('method', 'post')])],
+                        collector = self.collector)
+
+    def test_weird_chars_in_unquoted_attribute_values(self):
+        self._run_check('<form action=bogus|&#()value>', [
+                            ('starttag', 'form',
+                                [('action', 'bogus|&#()value')])],
+                        collector = self.collector)
+
+
 def test_main():
-    support.run_unittest(HTMLParserTestCase)
+    support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase)
 
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS b/Misc/NEWS
index 434ed23..7c5d8d1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -58,6 +58,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #1486713: HTMLParser now has an optional tolerant mode where it
+  tries to guess at the correct parsing of invalid html.
+
 - Issue #10554: Add context manager support to subprocess.Popen objects.
 
 - Issue #8989: email.utils.make_msgid now has a domain parameter that can
author	R. David Murray <rdmurray@bitdance.com>	2010-12-03 04:06:39 (GMT)
committer	R. David Murray <rdmurray@bitdance.com>	2010-12-03 04:06:39 (GMT)
commit	b579dba1195df97f87ba868a5987f18fb7509bff (patch)
tree	d1ff2cf38f061ee0bba08459167e33daa7a4ad79
parent	79cdb661f5a6cf8bba07aa50f4451f6c409bb067 (diff)
download	cpython-b579dba1195df97f87ba868a5987f18fb7509bff.zip cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.gz cpython-b579dba1195df97f87ba868a5987f18fb7509bff.tar.bz2