Deal more appropriately with bare ampersands and pointy brackets; this module has to deal with "classic" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows.

This closes SF bug #453059, but uses a different fix than suggested in the bug comments.
author: Fred Drake <fdrake@acm.org> 2001-08-20 21:24:19 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-08-20 21:24:19 (GMT)
commit: 029acfb922bdd25d6e38c864895c6cc66db76d13 (patch)
tree: e46ebe60a3cd9f0f3c20436ad226cf989dcb1b03 /Lib/HTMLParser.py
parent: 18da1e1e7f30d0612e7a36a369e1d422dd50ef41 (diff)
download: cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.zip
cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.gz
cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.bz2
1 file changed, 12 insertions, 12 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 39a5d82..954ce26 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -15,7 +15,8 @@ import string
 
 interesting_normal = re.compile('[&<]')
 interesting_cdata = re.compile(r'<(/|\Z)')
-incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
+incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
+                        '|#([0-9]*|[xX][0-9a-fA-F]*))?')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -185,11 +186,8 @@ class HTMLParser:
                 elif declopen.match(rawdata, i): # <!
                     k = self.parse_declaration(i)
                 else:
-                    if i < n-1:
-                        raise HTMLParseError(
-                            "invalid '<' construct: %s" % `rawdata[i:i+2]`,
-                            self.getpos())
-                    k = -1
+                    self.handle_data("<")
+                    k = i + 1
                 if k < 0:
                     if end:
                         raise HTMLParseError("EOF in middle of construct",
@@ -203,7 +201,7 @@ class HTMLParser:
                     self.handle_charref(name)
                     k = match.end()
                     if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                     i = self.updatepos(i, k)
                     continue
                 match = entityref.match(rawdata, i)
@@ -212,17 +210,19 @@ class HTMLParser:
                     self.handle_entityref(name)
                     k = match.end()
                     if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                     i = self.updatepos(i, k)
                     continue
-                if incomplete.match(rawdata, i):
-                    if end:
+                match = incomplete.match(rawdata, i)
+                if match:
+                    rest = rawdata[i:]
+                    if end and rest != "&" and match.group() == rest:
                         raise HTMLParseError(
                             "EOF in middle of entity or char ref",
                             self.getpos())
                     return -1 # incomplete
-                raise HTMLParseError("'&' not part of entity or char ref",
-                                     self.getpos())
+                self.handle_data("&")
+                i = self.updatepos(i, i + 1)
             else:
                 assert 0, "interesting.search() lied"
         # end while
author	Fred Drake <fdrake@acm.org>	2001-08-20 21:24:19 (GMT)
committer	Fred Drake <fdrake@acm.org>	2001-08-20 21:24:19 (GMT)
commit	029acfb922bdd25d6e38c864895c6cc66db76d13 (patch)
tree	e46ebe60a3cd9f0f3c20436ad226cf989dcb1b03 /Lib/HTMLParser.py
parent	18da1e1e7f30d0612e7a36a369e1d422dd50ef41 (diff)
download	cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.zip cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.gz cpython-029acfb922bdd25d6e38c864895c6cc66db76d13.tar.bz2