summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Fred Drake <fdrake@acm.org> 2001-09-04 16:26:03 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-09-04 16:26:03 (GMT)
commit: 7cf613dc77302fb9a2a6533878aba7296276e12c (patch)
tree: 4b0a537f66e0e65a3750bac2156545895b924050
parent: a0ca3d611e0abc503da85d999069803fe8bed7a1 (diff)
download: cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.zip
cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.gz
cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.bz2
HTMLParser is allowed to be more strict than sgmllib, so let's not
change their basic behavior: When parsing something that cannot possibly be valid in either HTML or XHTML, raise an exception.
-rw-r--r--  Lib/HTMLParser.py  47
-rwxr-xr-x  Lib/test/test_htmlparser.py  7
2 files changed, 17 insertions, 37 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 584046d..df8383e 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -269,17 +269,18 @@ class HTMLParser:
return -1
# in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
- decltype = None
- extrachars = ""
+ decltype, j = self.scan_name(j, i)
+ if j < 0:
+ return j
+ if decltype.lower() != "doctype":
+ raise HTMLParseError("unknown declaration: '%s'" % decltype,
+ self.getpos())
while j < n:
c = rawdata[j]
if c == ">":
# end of declaration syntax
data = rawdata[i+2:j]
- if decltype == "doctype":
- self.handle_decl(data)
- else:
- self.unknown_decl(data)
+ self.handle_decl(data)
return j + 1
if c in "\"'":
m = declstringlit.match(rawdata, j)
@@ -287,30 +288,15 @@ class HTMLParser:
return -1 # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
- m = declname.match(rawdata, j)
- if not m:
- return -1 # incomplete
- j = m.end()
- if decltype is None:
- decltype = m.group(0).rstrip().lower()
- if decltype != "doctype":
- extrachars = "="
+ name, j = self.scan_name(j, i)
elif c == "[" and decltype == "doctype":
j = self.parse_doctype_subset(j + 1, i)
- if j < 0:
- return j
- elif c in extrachars:
- j = j + 1
- while j < n and rawdata[j] in string.whitespace:
- j = j + 1
- if j == n:
- # end of buffer while in declaration
- return -1
else:
raise HTMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`,
self.getpos())
- decltype = decltype or ''
+ if j < 0:
+ return j
return -1 # incomplete
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
@@ -359,11 +345,9 @@ class HTMLParser:
if (j + 1) == n:
# end of buffer; incomplete
return -1
- m = declname.match(rawdata, j + 1)
- s = m.group()
- if s == rawdata[j+1:]:
- return -1
- j = j + 1 + len(s.rstrip())
+ s, j = self.scan_name(j + 1, declstartpos)
+ if j < 0:
+ return j
if rawdata[j] == ";":
j = j + 1
elif c == "]":
@@ -383,8 +367,9 @@ class HTMLParser:
j = j + 1
else:
self.updatepos(declstartpos, j)
- raise HTMLParseError("unexpected char in internal subset",
- self.getpos())
+ raise HTMLParseError(
+ "unexpected char %s in internal subset" % `c`,
+ self.getpos())
# end of buffer reached
return -1
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 4e8e73c..8661066 100755
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -203,12 +203,7 @@ DOCTYPE html [
])
def test_illegal_declarations(self):
- s = 'abc<!spacer type="block" height="25">def'
- self._run_check(s, [
- ("data", "abc"),
- ("unknown decl", 'spacer type="block" height="25"'),
- ("data", "def"),
- ])
+ self._parse_error('<!spacer type="block" height="25">')
def test_starttag_end_boundary(self):
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])