From 68eac2b574ed837099998d68ea592d02bd8ca129 Mon Sep 17 00:00:00 2001 From: Fred Drake Date: Tue, 4 Sep 2001 15:10:16 +0000 Subject: Added reasonable parsing of the DOCTYPE declaration, fixed edge cases regarding bare ampersands in content. --- Lib/HTMLParser.py | 272 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 260 insertions(+), 12 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 954ce26..584046d 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -15,8 +15,7 @@ import string interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') -incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*' - '|#([0-9]*|[xX][0-9a-fA-F]*))?') +incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') @@ -185,16 +184,18 @@ class HTMLParser: k = self.parse_pi(i) elif declopen.match(rawdata, i): # ' n = len(rawdata) + decltype = None + extrachars = "" while j < n: c = rawdata[j] if c == ">": # end of declaration syntax - self.handle_decl(rawdata[i+2:j]) + data = rawdata[i+2:j] + if decltype == "doctype": + self.handle_decl(data) + else: + self.unknown_decl(data) return j + 1 if c in "\"'": m = declstringlit.match(rawdata, j) @@ -273,12 +291,242 @@ class HTMLParser: if not m: return -1 # incomplete j = m.end() + if decltype is None: + decltype = m.group(0).rstrip().lower() + if decltype != "doctype": + extrachars = "=" + elif c == "[" and decltype == "doctype": + j = self.parse_doctype_subset(j + 1, i) + if j < 0: + return j + elif c in extrachars: + j = j + 1 + while j < n and rawdata[j] in string.whitespace: + j = j + 1 + if j == n: + # end of buffer while in declaration + return -1 else: raise HTMLParseError( "unexpected char in declaration: %s" % `rawdata[j]`, self.getpos()) + decltype = decltype or '' return -1 # incomplete + # Internal -- scan past the internal subset in a n: + # end of buffer; incomplete + return -1 + if rawdata[j:j+4] == "