summary | refs | log | tree | commit | diff | stats
path: root/Lib/html
diff options
context:
space:
mode:
author Ezio Melotti <ezio.melotti@gmail.com> 2012-02-10 08:45:44 (GMT)
committer Ezio Melotti <ezio.melotti@gmail.com> 2012-02-10 08:45:44 (GMT)
commit fa3702dc28fa8aef291785c560832c9af60305a8 (patch)
tree 3f83918e3a492d96c097ea85df7fda26559ab43d /Lib/html
parent 5b14d732d8790a6a19cc8aa410740575ff94c85a (diff)
download cpython-fa3702dc28fa8aef291785c560832c9af60305a8.zip
cpython-fa3702dc28fa8aef291785c560832c9af60305a8.tar.gz
cpython-fa3702dc28fa8aef291785c560832c9af60305a8.tar.bz2
#13960: HTMLParser is now able to handle broken comments when strict=False.
Diffstat (limited to 'Lib/html')
-rw-r--r-- Lib/html/parser.py 25
1 files changed, 24 insertions, 1 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index dd9c2e1..5c4a7ef 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
- k = self.parse_declaration(i)
+ # this might fail with things like <! not a comment > or
+ # <! -- space before '--' -->. When strict is True an
+ # error is raised, when it's False they will be considered
+ # as bogus comments and parsed (see parse_bogus_comment).
+ if self.strict:
+ k = self.parse_declaration(i)
+ else:
+ try:
+ k = self.parse_declaration(i)
+ except HTMLParseError:
+ k = self.parse_bogus_comment(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
@@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
+ # Internal -- parse bogus comment, return length or -1 if not terminated
+ # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+ def parse_bogus_comment(self, i, report=1):
+ rawdata = self.rawdata
+ if rawdata[i:i+2] != '<!':
+ self.error('unexpected call to parse_comment()')
+ pos = rawdata.find('>', i+2)
+ if pos == -1:
+ return -1
+ if report:
+ self.handle_comment(rawdata[i+2:pos])
+ return pos + 1
+
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata