diff options
author | Fred Drake <fdrake@acm.org> | 2001-08-20 21:24:19 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-08-20 21:24:19 (GMT) |
commit | 029acfb922bdd25d6e38c864895c6cc66db76d13 (patch) | |
tree | e46ebe60a3cd9f0f3c20436ad226cf989dcb1b03 | |
parent | 18da1e1e7f30d0612e7a36a369e1d422dd50ef41 (diff) | |
download | cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.zip cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.gz cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.bz2 |
Deal more appropriately with bare ampersands and pointy brackets; this
module has to deal with "classic" HTML-as-deployed as well as XHTML, so we
cannot be as strict as XHTML allows.
This closes SF bug #453059, but uses a different fix than suggested in
the bug comments.
-rw-r--r-- | Lib/HTMLParser.py | 24 | ||||
-rwxr-xr-x | Lib/test/test_htmlparser.py | 34 |
2 files changed, 39 insertions, 19 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 39a5d82..954ce26 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -15,7 +15,8 @@ import string interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') -incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') +incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*' + '|#([0-9]*|[xX][0-9a-fA-F]*))?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') @@ -185,11 +186,8 @@ class HTMLParser: elif declopen.match(rawdata, i): # <! k = self.parse_declaration(i) else: - if i < n-1: - raise HTMLParseError( - "invalid '<' construct: %s" % `rawdata[i:i+2]`, - self.getpos()) - k = -1 + self.handle_data("<") + k = i + 1 if k < 0: if end: raise HTMLParseError("EOF in middle of construct", @@ -203,7 +201,7 @@ class HTMLParser: self.handle_charref(name) k = match.end() if rawdata[k-1] != ';': - k = k-1 + k = k - 1 i = self.updatepos(i, k) continue match = entityref.match(rawdata, i) @@ -212,17 +210,19 @@ class HTMLParser: self.handle_entityref(name) k = match.end() if rawdata[k-1] != ';': - k = k-1 + k = k - 1 i = self.updatepos(i, k) continue - if incomplete.match(rawdata, i): - if end: + match = incomplete.match(rawdata, i) + if match: + rest = rawdata[i:] + if end and rest != "&" and match.group() == rest: raise HTMLParseError( "EOF in middle of entity or char ref", self.getpos()) return -1 # incomplete - raise HTMLParseError("'&' not part of entity or char ref", - self.getpos()) + self.handle_data("&") + i = self.updatepos(i, i + 1) else: assert 0, "interesting.search() lied" # end while diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index e0e212c..bb6e0b0 100755 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -1,6 +1,7 @@ """Tests for HTMLParser.py.""" import HTMLParser +import pprint import sys import test_support import unittest @@ -83,9 +84,10 @@ class 
TestCaseBase(unittest.TestCase): for c in self.epilogue: parser.feed(c) parser.close() - self.assert_(parser.get_events() == - self.initial_events + events + self.final_events, - parser.get_events()) + events = parser.get_events() + self.assertEqual(events, + self.initial_events + events + self.final_events, + "got events:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): self._run_check(source, events, EventCollectorExtra) @@ -137,6 +139,18 @@ text ("data", "\n"), ]) + def test_doctype_decl(self): + inside = """\ +DOCTYPE html [ + <!ELEMENT html - O EMPTY> + <!ATTLIST html + version CDATA #IMPLIED '4.0'> + <!-- comment --> +]""" + self._run_check("<!%s>" % inside, [ + ("decl", inside), + ]) + def test_bad_nesting(self): # Strangely, this *is* supposed to test that overlapping # elements are allowed. HTMLParser is more geared toward @@ -148,6 +162,16 @@ text ("endtag", "b"), ]) + def test_bare_ampersands(self): + self._run_check("this text & contains & ampersands &", [ + ("data", "this text & contains & ampersands &"), + ]) + + def test_bare_pointy_brackets(self): + self._run_check("this < text > contains < bare>pointy< brackets", [ + ("data", "this < text > contains < bare>pointy< brackets"), + ]) + def test_attr_syntax(self): output = [ ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) @@ -199,16 +223,12 @@ text self._run_check(["<a b='>'", ">"], output) def test_starttag_junk_chars(self): - self._parse_error("<") - self._parse_error("<>") self._parse_error("</>") self._parse_error("</$>") self._parse_error("</") self._parse_error("</a") self._parse_error("<a<a>") self._parse_error("</a<a>") - self._parse_error("<$") - self._parse_error("<$>") self._parse_error("<!") self._parse_error("<a $>") self._parse_error("<a") |