summary | refs | log | tree | commit | diff | stats
path: root/Lib/HTMLParser.py
diff options
context:
space:
mode:
author: Fred Drake <fdrake@acm.org>, 2001-09-24 20:10:28 (GMT)
committer: Fred Drake <fdrake@acm.org>, 2001-09-24 20:10:28 (GMT)
commit: bfc8fea1e0c46bc0a337237c32b8c1a32985c144 (patch)
tree: cb7cdb2a70027f78c350125d3593249df7adb548 /Lib/HTMLParser.py
parent: 1cffd5ccff4f4fed205d9257f279f954ee127685 (diff)
download: cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.zip
cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.tar.gz
cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.tar.bz2
Re-factor the HTMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors; the new base class also uses this.
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- Lib/HTMLParser.py | 324
1 files changed, 19 insertions, 305 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index f54e3d6..08c53b3 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -8,6 +8,7 @@
# and CDATA (character data -- only end tags are special).
+import markupbase
import re
import string
@@ -21,12 +22,8 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen = re.compile('<[a-zA-Z]')
-piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</')
-declopen = re.compile('<!')
-special = re.compile('<![^<>]*>')
-commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
@@ -47,13 +44,9 @@ locatestarttagend = re.compile(r"""
)*
\s* # trailing whitespace
""", re.VERBOSE)
-endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
-declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
-declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
-
class HTMLParseError(Exception):
"""Exception raised for all parse errors."""
@@ -73,7 +66,7 @@ class HTMLParseError(Exception):
return result
-class HTMLParser:
+class HTMLParser(markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
Usage:
@@ -105,9 +98,8 @@ class HTMLParser:
self.rawdata = ''
self.stack = []
self.lasttag = '???'
- self.lineno = 1
- self.offset = 0
self.interesting = interesting_normal
+ markupbase.ParserBase.reset(self)
def feed(self, data):
"""Feed data to the parser.
@@ -122,26 +114,8 @@ class HTMLParser:
"""Handle any buffered data."""
self.goahead(1)
- # Internal -- update line number and offset. This should be
- # called for each piece of data exactly once, in order -- in other
- # words the concatenation of all the input strings to this
- # function should be exactly the entire input.
- def updatepos(self, i, j):
- if i >= j:
- return j
- rawdata = self.rawdata
- nlines = string.count(rawdata, "\n", i, j)
- if nlines:
- self.lineno = self.lineno + nlines
- pos = string.rindex(rawdata, "\n", i, j) # Should not fail
- self.offset = j-(pos+1)
- else:
- self.offset = self.offset + j-i
- return j
-
- def getpos(self):
- """Return current line number and offset."""
- return self.lineno, self.offset
+ def error(self, message):
+ raise HTMLParseError(message, self.getpos())
__starttag_text = None
@@ -178,11 +152,11 @@ class HTMLParser:
k = self.parse_endtag(i)
if k >= 0:
self.clear_cdata_mode()
- elif commentopen.match(rawdata, i): # <!--
+ elif rawdata.startswith("<!--", i): # <!--
k = self.parse_comment(i)
- elif piopen.match(rawdata, i): # <?
+ elif rawdata.startswith("<?", i): # <?
k = self.parse_pi(i)
- elif declopen.match(rawdata, i): # <!
+ elif rawdata.startswith("<!", i): # <!
k = self.parse_declaration(i)
elif (i + 1) < n:
self.handle_data("<")
@@ -191,8 +165,7 @@ class HTMLParser:
break
if k < 0:
if end:
- raise HTMLParseError("EOF in middle of construct",
- self.getpos())
+ self.error("EOF in middle of construct")
break
i = self.updatepos(i, k)
elif rawdata[i:i+2] == "&#":
@@ -222,9 +195,7 @@ class HTMLParser:
# match.group() will contain at least 2 chars
rest = rawdata[i:]
if end and match.group() == rest:
- raise HTMLParseError(
- "EOF in middle of entity or char ref",
- self.getpos())
+ self.error("EOF in middle of entity or char ref")
# incomplete
break
elif (i + 1) < n:
@@ -255,263 +226,6 @@ class HTMLParser:
j = match.end()
return j
- # Internal -- parse declaration.
- def parse_declaration(self, i):
- # This is some sort of declaration; in "HTML as
- # deployed," this should only be the document type
- # declaration ("<!DOCTYPE html...>").
- rawdata = self.rawdata
- j = i + 2
- assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
- if rawdata[j:j+1] in ("-", ""):
- # Start of comment followed by buffer boundary,
- # or just a buffer boundary.
- return -1
- # in practice, this should look like: ((name|stringlit) S*)+ '>'
- n = len(rawdata)
- decltype, j = self.scan_name(j, i)
- if j < 0:
- return j
- if decltype.lower() != "doctype":
- raise HTMLParseError("unknown declaration: '%s'" % decltype,
- self.getpos())
- while j < n:
- c = rawdata[j]
- if c == ">":
- # end of declaration syntax
- data = rawdata[i+2:j]
- self.handle_decl(data)
- return j + 1
- if c in "\"'":
- m = declstringlit.match(rawdata, j)
- if not m:
- return -1 # incomplete
- j = m.end()
- elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
- name, j = self.scan_name(j, i)
- elif c == "[" and decltype == "doctype":
- j = self.parse_doctype_subset(j + 1, i)
- else:
- raise HTMLParseError(
- "unexpected char in declaration: %s" % `rawdata[j]`,
- self.getpos())
- if j < 0:
- return j
- return -1 # incomplete
-
- # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
- # returning the index just past any whitespace following the trailing ']'.
- def parse_doctype_subset(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- j = i
- while j < n:
- c = rawdata[j]
- if c == "<":
- s = rawdata[j:j+2]
- if s == "<":
- # end of buffer; incomplete
- return -1
- if s != "<!":
- self.updatepos(declstartpos, j + 1)
- raise HTMLParseError("unexpect char in internal subset",
- self.getpos())
- if (j + 2) == n:
- # end of buffer; incomplete
- return -1
- if (j + 4) > n:
- # end of buffer; incomplete
- return -1
- if rawdata[j:j+4] == "<!--":
- j = self.parse_comment(j, report=0)
- if j < 0:
- return j
- continue
- name, j = self.scan_name(j + 2, declstartpos)
- if j == -1:
- return -1
- if name not in ("attlist", "element", "entity", "notation"):
- self.updatepos(declstartpos, j + 2)
- raise HTMLParseError(
- "unknown declaration %s in internal subset" % `name`,
- self.getpos())
- # handle the individual names
- meth = getattr(self, "parse_doctype_" + name)
- j = meth(j, declstartpos)
- if j < 0:
- return j
- elif c == "%":
- # parameter entity reference
- if (j + 1) == n:
- # end of buffer; incomplete
- return -1
- s, j = self.scan_name(j + 1, declstartpos)
- if j < 0:
- return j
- if rawdata[j] == ";":
- j = j + 1
- elif c == "]":
- j = j + 1
- while j < n and rawdata[j] in string.whitespace:
- j = j + 1
- if j < n:
- if rawdata[j] == ">":
- return j
- self.updatepos(declstartpos, j)
- raise HTMLParseError(
- "unexpected char after internal subset",
- self.getpos())
- else:
- return -1
- elif c in string.whitespace:
- j = j + 1
- else:
- self.updatepos(declstartpos, j)
- raise HTMLParseError(
- "unexpected char %s in internal subset" % `c`,
- self.getpos())
- # end of buffer reached
- return -1
-
- def parse_doctype_element(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- name, j = self.scan_name(i, declstartpos)
- if j == -1:
- return -1
- # style content model; just skip until '>'
- if '>' in rawdata[j:]:
- return string.find(rawdata, ">", j) + 1
- return -1
-
- def parse_doctype_attlist(self, i, declstartpos):
- rawdata = self.rawdata
- name, j = self.scan_name(i, declstartpos)
- c = rawdata[j:j+1]
- if c == "":
- return -1
- if c == ">":
- return j + 1
- while 1:
- # scan a series of attribute descriptions; simplified:
- # name type [value] [#constraint]
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
- c = rawdata[j:j+1]
- if c == "":
- return -1
- if c == "(":
- # an enumerated type; look for ')'
- if ")" in rawdata[j:]:
- j = string.find(rawdata, ")", j) + 1
- else:
- return -1
- while rawdata[j:j+1] in string.whitespace:
- j = j + 1
- if not rawdata[j:]:
- # end of buffer, incomplete
- return -1
- else:
- name, j = self.scan_name(j, declstartpos)
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if m:
- j = m.end()
- else:
- return -1
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c == "#":
- if rawdata[j:] == "#":
- # end of buffer
- return -1
- name, j = self.scan_name(j + 1, declstartpos)
- if j < 0:
- return j
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c == '>':
- # all done
- return j + 1
-
- def parse_doctype_notation(self, i, declstartpos):
- name, j = self.scan_name(i, declstartpos)
- if j < 0:
- return j
- rawdata = self.rawdata
- while 1:
- c = rawdata[j:j+1]
- if not c:
- # end of buffer; incomplete
- return -1
- if c == '>':
- return j + 1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if not m:
- return -1
- j = m.end()
- else:
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
-
- def parse_doctype_entity(self, i, declstartpos):
- rawdata = self.rawdata
- if rawdata[i:i+1] == "%":
- j = i + 1
- while 1:
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c in string.whitespace:
- j = j + 1
- else:
- break
- else:
- j = i
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
- while 1:
- c = self.rawdata[j:j+1]
- if not c:
- return -1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if m:
- j = m.end()
- else:
- return -1 # incomplete
- elif c == ">":
- return j + 1
- else:
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
-
- def scan_name(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- if i == n:
- return None, -1
- m = declname.match(rawdata, i)
- if m:
- s = m.group()
- name = s.strip()
- if (i + len(s)) == n:
- return None, -1 # end of buffer
- return name.lower(), m.end()
- else:
- self.updatepos(declstartpos, i)
- raise HTMLParseError("expected name token", self.getpos())
-
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
@@ -563,9 +277,8 @@ class HTMLParser:
- string.rfind(self.__starttag_text, "\n")
else:
offset = offset + len(self.__starttag_text)
- raise HTMLParseError("junk characters in start tag: %s"
- % `rawdata[k:endpos][:20]`,
- (lineno, offset))
+ self.error("junk characters in start tag: %s"
+ % `rawdata[k:endpos][:20]`)
if end[-2:] == '/>':
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
@@ -594,8 +307,7 @@ class HTMLParser:
return -1
# else bogus input
self.updatepos(i, j + 1)
- raise HTMLParseError("malformed empty start tag",
- self.getpos())
+ self.error("malformed empty start tag")
if next == "":
# end of input
return -1
@@ -605,8 +317,8 @@ class HTMLParser:
# '/' from a '/>' ending
return -1
self.updatepos(i, j)
- raise HTMLParseError("malformed start tag", self.getpos())
- raise AssertionError("we should not gt here!")
+ self.error("malformed start tag")
+ raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
@@ -618,8 +330,7 @@ class HTMLParser:
j = match.end()
match = endtagfind.match(rawdata, i) # </ + tag + >
if not match:
- raise HTMLParseError("bad end tag: %s" % `rawdata[i:j]`,
- self.getpos())
+ self.error("bad end tag: %s" % `rawdata[i:j]`)
tag = match.group(1)
self.handle_endtag(string.lower(tag))
return j
@@ -661,6 +372,9 @@ class HTMLParser:
def handle_pi(self, data):
pass
+ def unknown_decl(self, data):
+ self.error("unknown declaration: " + `data`)
+
# Internal -- helper to remove special character quoting
def unescape(self, s):
if '&' not in s: