diff options
author | Fred Drake <fdrake@acm.org> | 2001-09-04 16:26:03 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-09-04 16:26:03 (GMT) |
commit | 7cf613dc77302fb9a2a6533878aba7296276e12c (patch) | |
tree | 4b0a537f66e0e65a3750bac2156545895b924050 | |
parent | a0ca3d611e0abc503da85d999069803fe8bed7a1 (diff) | |
download | cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.zip cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.gz cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.bz2 |
HTMLParser is allowed to be stricter than sgmllib, so let's not
change the basic behavior of either module: when parsing something that
cannot possibly be valid in either HTML or XHTML, raise an exception.
-rw-r--r-- | Lib/HTMLParser.py | 47 | ||||
-rwxr-xr-x | Lib/test/test_htmlparser.py | 7 |
2 files changed, 17 insertions, 37 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 584046d..df8383e 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -269,17 +269,18 @@ class HTMLParser: return -1 # in practice, this should look like: ((name|stringlit) S*)+ '>' n = len(rawdata) - decltype = None - extrachars = "" + decltype, j = self.scan_name(j, i) + if j < 0: + return j + if decltype.lower() != "doctype": + raise HTMLParseError("unknown declaration: '%s'" % decltype, + self.getpos()) while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] - if decltype == "doctype": - self.handle_decl(data) - else: - self.unknown_decl(data) + self.handle_decl(data) return j + 1 if c in "\"'": m = declstringlit.match(rawdata, j) @@ -287,30 +288,15 @@ class HTMLParser: return -1 # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": - m = declname.match(rawdata, j) - if not m: - return -1 # incomplete - j = m.end() - if decltype is None: - decltype = m.group(0).rstrip().lower() - if decltype != "doctype": - extrachars = "=" + name, j = self.scan_name(j, i) elif c == "[" and decltype == "doctype": j = self.parse_doctype_subset(j + 1, i) - if j < 0: - return j - elif c in extrachars: - j = j + 1 - while j < n and rawdata[j] in string.whitespace: - j = j + 1 - if j == n: - # end of buffer while in declaration - return -1 else: raise HTMLParseError( "unexpected char in declaration: %s" % `rawdata[j]`, self.getpos()) - decltype = decltype or '' + if j < 0: + return j return -1 # incomplete # Internal -- scan past the internal subset in a <!DOCTYPE declaration, @@ -359,11 +345,9 @@ class HTMLParser: if (j + 1) == n: # end of buffer; incomplete return -1 - m = declname.match(rawdata, j + 1) - s = m.group() - if s == rawdata[j+1:]: - return -1 - j = j + 1 + len(s.rstrip()) + s, j = self.scan_name(j + 1, declstartpos) + if j < 0: + return j if rawdata[j] == ";": j = j + 1 elif c == "]": @@ -383,8 +367,9 @@ class HTMLParser: j = j + 1 
else: self.updatepos(declstartpos, j) - raise HTMLParseError("unexpected char in internal subset", - self.getpos()) + raise HTMLParseError( + "unexpected char %s in internal subset" % `c`, + self.getpos()) # end of buffer reached return -1 diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 4e8e73c..8661066 100755 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -203,12 +203,7 @@ DOCTYPE html [ ]) def test_illegal_declarations(self): - s = 'abc<!spacer type="block" height="25">def' - self._run_check(s, [ - ("data", "abc"), - ("unknown decl", 'spacer type="block" height="25"'), - ("data", "def"), - ]) + self._parse_error('<!spacer type="block" height="25">') def test_starttag_end_boundary(self): self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])]) |