summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/html/parser.py 25
-rw-r--r-- Lib/test/test_htmlparser.py 30
-rw-r--r-- Misc/NEWS 5
3 files changed, 58 insertions, 2 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index dd9c2e1..5c4a7ef 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
- k = self.parse_declaration(i)
+ # this might fail with things like <! not a comment > or
+ # <! -- space before '--' -->. When strict is True an
+ # error is raised, when it's False they will be considered
+ # as bogus comments and parsed (see parse_bogus_comment).
+ if self.strict:
+ k = self.parse_declaration(i)
+ else:
+ try:
+ k = self.parse_declaration(i)
+ except HTMLParseError:
+ k = self.parse_bogus_comment(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
+    # Internal -- parse bogus comment, return end or -1 if not terminated
+    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+    def parse_bogus_comment(self, i, report=1):
+        """Parse a bogus comment: everything from '<!' up to the next '>'.
+
+        i is the index in self.rawdata at which the '<!' starts.  When
+        report is true, the text between '<!' and '>' is passed to
+        self.handle_comment().  Return the index just past the closing
+        '>', or -1 if no '>' is found from position i+2 onwards.
+        """
+        rawdata = self.rawdata
+        if rawdata[i:i+2] != '<!':
+            # the message must name this method, not parse_comment()
+            self.error('unexpected call to parse_bogus_comment()')
+        pos = rawdata.find('>', i+2)
+        if pos == -1:
+            # no closing '>' in the data seen so far
+            return -1
+        if report:
+            self.handle_comment(rawdata[i+2:pos])
+        return pos + 1
+
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 8c2e25e..7af9131 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -323,6 +323,23 @@ DOCTYPE html [
("endtag", element_lower)],
collector=Collector())
+    def test_comments(self):
+        # Each pair is (comment markup, data expected in its 'comment'
+        # event); the markup is concatenated into one input for _run_check.
+        cases = [
+            ("<!-- I'm a valid comment -->", " I'm a valid comment "),
+            ('<!--me too!-->', 'me too!'),
+            ('<!------>', '--'),
+            ('<!---->', ''),
+            ('<!----I have many hyphens---->', '--I have many hyphens--'),
+            ('<!-- I have a > in the middle -->',
+             ' I have a > in the middle '),
+            ('<!-- and I have -- in the middle! -->',
+             ' and I have -- in the middle! '),
+        ]
+        html = ''.join(markup for markup, _ in cases)
+        expected = [('comment', data) for _, data in cases]
+        self._run_check(html, expected)
+
def test_condcoms(self):
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
'<!--[if IE 8]>condcoms<![endif]-->'
@@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
# see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
+    def test_broken_comments(self):
+        # Each pair is (malformed '<!...>' markup, data expected in the
+        # 'comment' event reported for it); the markup is concatenated
+        # into one input for _run_check.
+        cases = [
+            ('<! not really a comment >', ' not really a comment '),
+            ('<! not a comment either -->', ' not a comment either --'),
+            ('<! -- close enough -->', ' -- close enough --'),
+            ('<!!! another bogus comment !!!>',
+             '!! another bogus comment !!!'),
+        ]
+        html = ''.join(markup for markup, _ in cases)
+        expected = [('comment', data) for _, data in cases]
+        self._run_check(html, expected)
+
def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>'
html = ('<![if !(IE)]>broken condcom<![endif]>'
diff --git a/Misc/NEWS b/Misc/NEWS
index 487b46f..d1f9ab0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -113,6 +113,9 @@ Core and Builtins
Library
-------
+- Issue #13960: HTMLParser is now able to handle broken comments when
+ strict=False.
+
- Issue #9021: Add an introduction to the copy module documentation.
- Issue #6005: Examples in the socket library documentation use sendall, where
@@ -123,7 +126,7 @@ Library
- Issue #10881: Fix test_site failure with OS X framework builds.
-- Issue #964437 Make IDLE help window non-modal.
+- Issue #964437: Make IDLE help window non-modal.
Patch by Guilherme Polo and Roger Serwy.
- Issue #2945: Make the distutils upload command aware of bdist_rpm products.