summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Fred Drake <fdrake@acm.org> 2001-08-20 21:24:19 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-08-20 21:24:19 (GMT)
commit: 029acfb922bdd25d6e38c864895c6cc66db76d13 (patch)
tree: e46ebe60a3cd9f0f3c20436ad226cf989dcb1b03
parent: 18da1e1e7f30d0612e7a36a369e1d422dd50ef41 (diff)
download: cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.zip
cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.gz
cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.bz2
Deal more appropriately with bare ampersands and pointy brackets; this
module has to deal with "classic" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows. This closes SF bug #453059, but uses a different fix than suggested in the bug comments.
-rw-r--r-- Lib/HTMLParser.py 24
-rwxr-xr-x Lib/test/test_htmlparser.py 34
2 files changed, 39 insertions, 19 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 39a5d82..954ce26 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -15,7 +15,8 @@ import string
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
-incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
+incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
+ '|#([0-9]*|[xX][0-9a-fA-F]*))?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -185,11 +186,8 @@ class HTMLParser:
elif declopen.match(rawdata, i): # <!
k = self.parse_declaration(i)
else:
- if i < n-1:
- raise HTMLParseError(
- "invalid '<' construct: %s" % `rawdata[i:i+2]`,
- self.getpos())
- k = -1
+ self.handle_data("<")
+ k = i + 1
if k < 0:
if end:
raise HTMLParseError("EOF in middle of construct",
@@ -203,7 +201,7 @@ class HTMLParser:
self.handle_charref(name)
k = match.end()
if rawdata[k-1] != ';':
- k = k-1
+ k = k - 1
i = self.updatepos(i, k)
continue
match = entityref.match(rawdata, i)
@@ -212,17 +210,19 @@ class HTMLParser:
self.handle_entityref(name)
k = match.end()
if rawdata[k-1] != ';':
- k = k-1
+ k = k - 1
i = self.updatepos(i, k)
continue
- if incomplete.match(rawdata, i):
- if end:
+ match = incomplete.match(rawdata, i)
+ if match:
+ rest = rawdata[i:]
+ if end and rest != "&" and match.group() == rest:
raise HTMLParseError(
"EOF in middle of entity or char ref",
self.getpos())
return -1 # incomplete
- raise HTMLParseError("'&' not part of entity or char ref",
- self.getpos())
+ self.handle_data("&")
+ i = self.updatepos(i, i + 1)
else:
assert 0, "interesting.search() lied"
# end while
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e0e212c..bb6e0b0 100755
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -1,6 +1,7 @@
"""Tests for HTMLParser.py."""
import HTMLParser
+import pprint
import sys
import test_support
import unittest
@@ -83,9 +84,10 @@ class TestCaseBase(unittest.TestCase):
for c in self.epilogue:
parser.feed(c)
parser.close()
- self.assert_(parser.get_events() ==
- self.initial_events + events + self.final_events,
- parser.get_events())
+ events = parser.get_events()
+ self.assertEqual(events,
+ self.initial_events + events + self.final_events,
+ "got events:\n" + pprint.pformat(events))
def _run_check_extra(self, source, events):
self._run_check(source, events, EventCollectorExtra)
@@ -137,6 +139,18 @@ text
("data", "\n"),
])
+ def test_doctype_decl(self):
+ inside = """\
+DOCTYPE html [
+ <!ELEMENT html - O EMPTY>
+ <!ATTLIST html
+ version CDATA #IMPLIED '4.0'>
+ <!-- comment -->
+]"""
+ self._run_check("<!%s>" % inside, [
+ ("decl", inside),
+ ])
+
def test_bad_nesting(self):
# Strangely, this *is* supposed to test that overlapping
# elements are allowed. HTMLParser is more geared toward
@@ -148,6 +162,16 @@ text
("endtag", "b"),
])
+ def test_bare_ampersands(self):
+ self._run_check("this text & contains & ampersands &", [
+ ("data", "this text & contains & ampersands &"),
+ ])
+
+ def test_bare_pointy_brackets(self):
+ self._run_check("this < text > contains < bare>pointy< brackets", [
+ ("data", "this < text > contains < bare>pointy< brackets"),
+ ])
+
def test_attr_syntax(self):
output = [
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
@@ -199,16 +223,12 @@ text
self._run_check(["<a b='>'", ">"], output)
def test_starttag_junk_chars(self):
- self._parse_error("<")
- self._parse_error("<>")
self._parse_error("</>")
self._parse_error("</$>")
self._parse_error("</")
self._parse_error("</a")
self._parse_error("<a<a>")
self._parse_error("</a<a>")
- self._parse_error("<$")
- self._parse_error("<$>")
self._parse_error("<!")
self._parse_error("<a $>")
self._parse_error("<a")