From 029acfb922bdd25d6e38c864895c6cc66db76d13 Mon Sep 17 00:00:00 2001 From: Fred Drake Date: Mon, 20 Aug 2001 21:24:19 +0000 Subject: Deal more appropriately with bare ampersands and pointy brackets; this module has to deal with "class" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows. This closes SF bug #453059, but uses a different fix than suggested in the bug comments. --- Lib/HTMLParser.py | 24 ++++++++++++------------ Lib/test/test_htmlparser.py | 34 +++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 39a5d82..954ce26 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -15,7 +15,8 @@ import string interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') -incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') +incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*' + '|#([0-9]*|[xX][0-9a-fA-F]*))?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') @@ -185,11 +186,8 @@ class HTMLParser: elif declopen.match(rawdata, i): # + + +]""" + self._run_check("" % inside, [ + ("decl", inside), + ]) + def test_bad_nesting(self): # Strangely, this *is* supposed to test that overlapping # elements are allowed. HTMLParser is more geared toward @@ -148,6 +162,16 @@ text ("endtag", "b"), ]) + def test_bare_ampersands(self): + self._run_check("this text & contains & ampersands &", [ + ("data", "this text & contains & ampersands &"), + ]) + + def test_bare_pointy_brackets(self): + self._run_check("this < text > contains < bare>pointy< brackets", [ + ("data", "this < text > contains < bare>pointy< brackets"), + ]) + def test_attr_syntax(self): output = [ ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) @@ -199,16 +223,12 @@ text self._run_check([""], output) def test_starttag_junk_chars(self): - self._parse_error("<") - self._parse_error("<>") self._parse_error("") self._parse_error("") self._parse_error("") self._parse_error("") - self._parse_error("<$") - self._parse_error("<$>") self._parse_error("") self._parse_error("