diff options
-rw-r--r-- | Lib/HTMLParser.py | 36
-rw-r--r-- | Lib/test/test_htmlparser.py | 58
-rw-r--r-- | Misc/NEWS | 2
3 files changed, 74 insertions, 22 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 1c6989e..516bc70 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -160,7 +160,7 @@ class HTMLParser(markupbase.ParserBase):
                 elif startswith("<?", i):
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
-                    k = self.parse_declaration(i)
+                    k = self.parse_html_declaration(i)
                 elif (i + 1) < n:
                     self.handle_data("<")
                     k = i + 1
@@ -218,6 +218,40 @@ class HTMLParser(markupbase.ParserBase):
             i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
 
+    # Internal -- parse html declarations, return end or -1 if not terminated
+    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
+    # See also parse_declaration in _markupbase
+    def parse_html_declaration(self, i):
+        rawdata = self.rawdata
+        if rawdata[i:i+2] != '<!':
+            self.error('unexpected call to parse_html_declaration()')
+        if rawdata[i:i+4] == '<!--':
+            return self.parse_comment(i)
+        elif rawdata[i:i+3] == '<![':
+            return self.parse_marked_section(i)
+        elif rawdata[i:i+9].lower() == '<!doctype':
+            # find the closing > (search relative to i, not the buffer start)
+            gtpos = rawdata.find('>', i+9)
+            if gtpos == -1:
+                return -1
+            self.handle_decl(rawdata[i+2:gtpos])
+            return gtpos+1
+        else:
+            return self.parse_bogus_comment(i)
+
+    # Internal -- parse bogus comment, return end or -1 if not terminated
+    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+    def parse_bogus_comment(self, i, report=1):
+        rawdata = self.rawdata
+        if rawdata[i:i+2] != '<!':
+            self.error('unexpected call to parse_bogus_comment()')
+        pos = rawdata.find('>', i+2)
+        if pos == -1:
+            return -1
+        if report:
+            self.handle_comment(rawdata[i+2:pos])
+        return pos + 1
+
     # Internal -- parse processing instr, return end or -1 if not terminated
     def parse_pi(self, i):
         rawdata = self.rawdata
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 14ed80c..29a721c 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -114,7 +114,7 @@ comment1b-->
 <Img sRc='Bar' isMAP>sample
 text
 &#x201C;
-<!--comment2a-- --comment2b--><!>
+<!--comment2a-- --comment2b-->
 </Html>
 """, [
     ("data", "\n"),
@@ -142,24 +142,6 @@ text
             ("data", " foo"),
             ])
 
-    def test_doctype_decl(self):
-        inside = """\
-DOCTYPE html [
-  <!ELEMENT html - O EMPTY>
-  <!ATTLIST html
-    version CDATA #IMPLIED
-    profile CDATA 'DublinCore'>
-  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
-  <!ENTITY myEntity 'internal parsed entity'>
-  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
-  <!ENTITY % paramEntity 'name|name|name'>
-  %paramEntity;
-  <!-- comment -->
-]"""
-        self._run_check("<!%s>" % inside, [
-            ("decl", inside),
-            ])
-
     def test_bad_nesting(self):
         # Strangely, this *is* supposed to test that overlapping
         # elements are allowed. HTMLParser is more geared toward
@@ -182,7 +164,8 @@ DOCTYPE html [
             ])
 
     def test_illegal_declarations(self):
-        self._parse_error('<!spacer type="block" height="25">')
+        self._run_check('<!spacer type="block" height="25">',
+                        [('comment', 'spacer type="block" height="25"')])
 
     def test_starttag_end_boundary(self):
         self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
@@ -233,7 +216,7 @@ DOCTYPE html [
         self._parse_error("<a foo='>")
 
     def test_declaration_junk_chars(self):
-        self._parse_error("<!DOCTYPE foo $ >")
+        self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
 
     def test_startendtag(self):
         self._run_check("<p/>", [
@@ -449,6 +432,39 @@ class AttributesTestCase(TestCaseBase):
              [("href", "http://www.example.org/\">;")]),
             ("data", "spam"), ("endtag", "a")])
 
+    def test_comments(self):
+        html = ("<!-- I'm a valid comment -->"
+                '<!--me too!-->'
+                '<!------>'
+                '<!---->'
+                '<!----I have many hyphens---->'
+                '<!-- I have a > in the middle -->'
+                '<!-- and I have -- in the middle! -->')
+        expected = [('comment', " I'm a valid comment "),
+                    ('comment', 'me too!'),
+                    ('comment', '--'),
+                    ('comment', ''),
+                    ('comment', '--I have many hyphens--'),
+                    ('comment', ' I have a > in the middle '),
+                    ('comment', ' and I have -- in the middle! ')]
+        self._run_check(html, expected)
+
+    def test_broken_comments(self):
+        html = ('<! not really a comment >'
+                '<! not a comment either -->'
+                '<! -- close enough -->'
+                '<!><!<-- this was an empty comment>'
+                '<!!! another bogus comment !!!>')
+        expected = [
+            ('comment', ' not really a comment '),
+            ('comment', ' not a comment either --'),
+            ('comment', ' -- close enough --'),
+            ('comment', ''),
+            ('comment', '<-- this was an empty comment'),
+            ('comment', '!! another bogus comment !!!'),
+        ]
+        self._run_check(html, expected)
+
     def test_condcoms(self):
         html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
                 '<!--[if IE 8]>condcoms<![endif]-->'
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -90,6 +90,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #13960: HTMLParser is now able to handle broken comments.
+
 - Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields with a name
   that is a keyword or contains quotes. Patch by Marko Kohtala.
 