HTMLParser is allowed to be more strict than sgmllib, so let's not change their basic behavior: When parsing something that cannot possibly be valid in either HTML or XHTML, raise an exception.
author: Fred Drake <fdrake@acm.org> 2001-09-04 16:26:03 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-09-04 16:26:03 (GMT)
commit: 7cf613dc77302fb9a2a6533878aba7296276e12c (patch)
tree: 4b0a537f66e0e65a3750bac2156545895b924050
parent: a0ca3d611e0abc503da85d999069803fe8bed7a1 (diff)
download: cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.zip cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.gz cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.bz2
2 files changed, 17 insertions, 37 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 584046d..df8383e 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -269,17 +269,18 @@ class HTMLParser:
             return -1
         # in practice, this should look like: ((name|stringlit) S*)+ '>'
         n = len(rawdata)
-        decltype = None
-        extrachars = ""
+        decltype, j = self.scan_name(j, i)
+        if j < 0:
+            return j
+        if decltype.lower() != "doctype":
+            raise HTMLParseError("unknown declaration: '%s'" % decltype,
+                                 self.getpos())
         while j < n:
             c = rawdata[j]
             if c == ">":
                 # end of declaration syntax
                 data = rawdata[i+2:j]
-                if decltype == "doctype":
-                    self.handle_decl(data)
-                else:
-                    self.unknown_decl(data)
+                self.handle_decl(data)
                 return j + 1
             if c in "\"'":
                 m = declstringlit.match(rawdata, j)
@@ -287,30 +288,15 @@ class HTMLParser:
                     return -1 # incomplete
                 j = m.end()
             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
-                m = declname.match(rawdata, j)
-                if not m:
-                    return -1 # incomplete
-                j = m.end()
-                if decltype is None:
-                    decltype = m.group(0).rstrip().lower()
-                    if decltype != "doctype":
-                        extrachars = "="
+                name, j = self.scan_name(j, i)
             elif c == "[" and decltype == "doctype":
                 j = self.parse_doctype_subset(j + 1, i)
-                if j < 0:
-                    return j
-            elif c in extrachars:
-                j = j + 1
-                while j < n and rawdata[j] in string.whitespace:
-                    j = j + 1
-                if j == n:
-                    # end of buffer while in declaration
-                    return -1
             else:
                 raise HTMLParseError(
                     "unexpected char in declaration: %s" % `rawdata[j]`,
                     self.getpos())
-            decltype = decltype or ''
+            if j < 0:
+                return j
         return -1 # incomplete
 
     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
@@ -359,11 +345,9 @@ class HTMLParser:
                 if (j + 1) == n:
                     # end of buffer; incomplete
                     return -1
-                m = declname.match(rawdata, j + 1)
-                s = m.group()
-                if s == rawdata[j+1:]:
-                    return -1
-                j = j + 1 + len(s.rstrip())
+                s, j = self.scan_name(j + 1, declstartpos)
+                if j < 0:
+                    return j
                 if rawdata[j] == ";":
                     j = j + 1
             elif c == "]":
@@ -383,8 +367,9 @@ class HTMLParser:
                 j = j + 1
             else:
                 self.updatepos(declstartpos, j)
-                raise HTMLParseError("unexpected char in internal subset",
-                                     self.getpos())
+                raise HTMLParseError(
+                    "unexpected char %s in internal subset" % `c`,
+                    self.getpos())
         # end of buffer reached
         return -1
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 4e8e73c..8661066 100755
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -203,12 +203,7 @@ DOCTYPE html [
             ])
 
     def test_illegal_declarations(self):
-        s = 'abc<!spacer type="block" height="25">def'
-        self._run_check(s, [
-            ("data", "abc"),
-            ("unknown decl", 'spacer type="block" height="25"'),
-            ("data", "def"),
-            ])
+        self._parse_error('<!spacer type="block" height="25">')
 
     def test_starttag_end_boundary(self):
         self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
author	Fred Drake <fdrake@acm.org>	2001-09-04 16:26:03 (GMT)
committer	Fred Drake <fdrake@acm.org>	2001-09-04 16:26:03 (GMT)
commit	7cf613dc77302fb9a2a6533878aba7296276e12c (patch)
tree	4b0a537f66e0e65a3750bac2156545895b924050
parent	a0ca3d611e0abc503da85d999069803fe8bed7a1 (diff)
download	cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.zip cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.gz cpython-7cf613dc77302fb9a2a6533878aba7296276e12c.tar.bz2