diff options
author | Fred Drake <fdrake@acm.org> | 2001-09-24 20:10:28 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-09-24 20:10:28 (GMT) |
commit | bfc8fea1e0c46bc0a337237c32b8c1a32985c144 (patch) | |
tree | cb7cdb2a70027f78c350125d3593249df7adb548 /Lib/HTMLParser.py | |
parent | 1cffd5ccff4f4fed205d9257f279f954ee127685 (diff) | |
download | cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.zip cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.tar.gz cpython-bfc8fea1e0c46bc0a337237c32b8c1a32985c144.tar.bz2 |
Re-factor the HTMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors;
the new base class also uses this.
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- | Lib/HTMLParser.py | 324 |
1 file changed, 19 insertions, 305 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index f54e3d6..08c53b3 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -8,6 +8,7 @@ # and CDATA (character data -- only end tags are special). +import markupbase import re import string @@ -21,12 +22,8 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') -piopen = re.compile(r'<\?') piclose = re.compile('>') endtagopen = re.compile('</') -declopen = re.compile('<!') -special = re.compile('<![^<>]*>') -commentopen = re.compile('<!--') commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile( @@ -47,13 +44,9 @@ locatestarttagend = re.compile(r""" )* \s* # trailing whitespace """, re.VERBOSE) -endstarttag = re.compile(r"\s*/?>") endendtag = re.compile('>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') -declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*') -declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*') - class HTMLParseError(Exception): """Exception raised for all parse errors.""" @@ -73,7 +66,7 @@ class HTMLParseError(Exception): return result -class HTMLParser: +class HTMLParser(markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: @@ -105,9 +98,8 @@ class HTMLParser: self.rawdata = '' self.stack = [] self.lasttag = '???' - self.lineno = 1 - self.offset = 0 self.interesting = interesting_normal + markupbase.ParserBase.reset(self) def feed(self, data): """Feed data to the parser. @@ -122,26 +114,8 @@ class HTMLParser: """Handle any buffered data.""" self.goahead(1) - # Internal -- update line number and offset. This should be - # called for each piece of data exactly once, in order -- in other - # words the concatenation of all the input strings to this - # function should be exactly the entire input. 
- def updatepos(self, i, j): - if i >= j: - return j - rawdata = self.rawdata - nlines = string.count(rawdata, "\n", i, j) - if nlines: - self.lineno = self.lineno + nlines - pos = string.rindex(rawdata, "\n", i, j) # Should not fail - self.offset = j-(pos+1) - else: - self.offset = self.offset + j-i - return j - - def getpos(self): - """Return current line number and offset.""" - return self.lineno, self.offset + def error(self, message): + raise HTMLParseError(message, self.getpos()) __starttag_text = None @@ -178,11 +152,11 @@ class HTMLParser: k = self.parse_endtag(i) if k >= 0: self.clear_cdata_mode() - elif commentopen.match(rawdata, i): # <!-- + elif rawdata.startswith("<!--", i): # <!-- k = self.parse_comment(i) - elif piopen.match(rawdata, i): # <? + elif rawdata.startswith("<?", i): # <? k = self.parse_pi(i) - elif declopen.match(rawdata, i): # <! + elif rawdata.startswith("<!", i): # <! k = self.parse_declaration(i) elif (i + 1) < n: self.handle_data("<") @@ -191,8 +165,7 @@ class HTMLParser: break if k < 0: if end: - raise HTMLParseError("EOF in middle of construct", - self.getpos()) + self.error("EOF in middle of construct") break i = self.updatepos(i, k) elif rawdata[i:i+2] == "&#": @@ -222,9 +195,7 @@ class HTMLParser: # match.group() will contain at least 2 chars rest = rawdata[i:] if end and match.group() == rest: - raise HTMLParseError( - "EOF in middle of entity or char ref", - self.getpos()) + self.error("EOF in middle of entity or char ref") # incomplete break elif (i + 1) < n: @@ -255,263 +226,6 @@ class HTMLParser: j = match.end() return j - # Internal -- parse declaration. - def parse_declaration(self, i): - # This is some sort of declaration; in "HTML as - # deployed," this should only be the document type - # declaration ("<!DOCTYPE html...>"). 
- rawdata = self.rawdata - j = i + 2 - assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" - if rawdata[j:j+1] in ("-", ""): - # Start of comment followed by buffer boundary, - # or just a buffer boundary. - return -1 - # in practice, this should look like: ((name|stringlit) S*)+ '>' - n = len(rawdata) - decltype, j = self.scan_name(j, i) - if j < 0: - return j - if decltype.lower() != "doctype": - raise HTMLParseError("unknown declaration: '%s'" % decltype, - self.getpos()) - while j < n: - c = rawdata[j] - if c == ">": - # end of declaration syntax - data = rawdata[i+2:j] - self.handle_decl(data) - return j + 1 - if c in "\"'": - m = declstringlit.match(rawdata, j) - if not m: - return -1 # incomplete - j = m.end() - elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": - name, j = self.scan_name(j, i) - elif c == "[" and decltype == "doctype": - j = self.parse_doctype_subset(j + 1, i) - else: - raise HTMLParseError( - "unexpected char in declaration: %s" % `rawdata[j]`, - self.getpos()) - if j < 0: - return j - return -1 # incomplete - - # Internal -- scan past the internal subset in a <!DOCTYPE declaration, - # returning the index just past any whitespace following the trailing ']'. 
- def parse_doctype_subset(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - j = i - while j < n: - c = rawdata[j] - if c == "<": - s = rawdata[j:j+2] - if s == "<": - # end of buffer; incomplete - return -1 - if s != "<!": - self.updatepos(declstartpos, j + 1) - raise HTMLParseError("unexpect char in internal subset", - self.getpos()) - if (j + 2) == n: - # end of buffer; incomplete - return -1 - if (j + 4) > n: - # end of buffer; incomplete - return -1 - if rawdata[j:j+4] == "<!--": - j = self.parse_comment(j, report=0) - if j < 0: - return j - continue - name, j = self.scan_name(j + 2, declstartpos) - if j == -1: - return -1 - if name not in ("attlist", "element", "entity", "notation"): - self.updatepos(declstartpos, j + 2) - raise HTMLParseError( - "unknown declaration %s in internal subset" % `name`, - self.getpos()) - # handle the individual names - meth = getattr(self, "parse_doctype_" + name) - j = meth(j, declstartpos) - if j < 0: - return j - elif c == "%": - # parameter entity reference - if (j + 1) == n: - # end of buffer; incomplete - return -1 - s, j = self.scan_name(j + 1, declstartpos) - if j < 0: - return j - if rawdata[j] == ";": - j = j + 1 - elif c == "]": - j = j + 1 - while j < n and rawdata[j] in string.whitespace: - j = j + 1 - if j < n: - if rawdata[j] == ">": - return j - self.updatepos(declstartpos, j) - raise HTMLParseError( - "unexpected char after internal subset", - self.getpos()) - else: - return -1 - elif c in string.whitespace: - j = j + 1 - else: - self.updatepos(declstartpos, j) - raise HTMLParseError( - "unexpected char %s in internal subset" % `c`, - self.getpos()) - # end of buffer reached - return -1 - - def parse_doctype_element(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - name, j = self.scan_name(i, declstartpos) - if j == -1: - return -1 - # style content model; just skip until '>' - if '>' in rawdata[j:]: - return string.find(rawdata, ">", j) + 1 - return -1 - - def 
parse_doctype_attlist(self, i, declstartpos): - rawdata = self.rawdata - name, j = self.scan_name(i, declstartpos) - c = rawdata[j:j+1] - if c == "": - return -1 - if c == ">": - return j + 1 - while 1: - # scan a series of attribute descriptions; simplified: - # name type [value] [#constraint] - name, j = self.scan_name(j, declstartpos) - if j < 0: - return j - c = rawdata[j:j+1] - if c == "": - return -1 - if c == "(": - # an enumerated type; look for ')' - if ")" in rawdata[j:]: - j = string.find(rawdata, ")", j) + 1 - else: - return -1 - while rawdata[j:j+1] in string.whitespace: - j = j + 1 - if not rawdata[j:]: - # end of buffer, incomplete - return -1 - else: - name, j = self.scan_name(j, declstartpos) - c = rawdata[j:j+1] - if not c: - return -1 - if c in "'\"": - m = declstringlit.match(rawdata, j) - if m: - j = m.end() - else: - return -1 - c = rawdata[j:j+1] - if not c: - return -1 - if c == "#": - if rawdata[j:] == "#": - # end of buffer - return -1 - name, j = self.scan_name(j + 1, declstartpos) - if j < 0: - return j - c = rawdata[j:j+1] - if not c: - return -1 - if c == '>': - # all done - return j + 1 - - def parse_doctype_notation(self, i, declstartpos): - name, j = self.scan_name(i, declstartpos) - if j < 0: - return j - rawdata = self.rawdata - while 1: - c = rawdata[j:j+1] - if not c: - # end of buffer; incomplete - return -1 - if c == '>': - return j + 1 - if c in "'\"": - m = declstringlit.match(rawdata, j) - if not m: - return -1 - j = m.end() - else: - name, j = self.scan_name(j, declstartpos) - if j < 0: - return j - - def parse_doctype_entity(self, i, declstartpos): - rawdata = self.rawdata - if rawdata[i:i+1] == "%": - j = i + 1 - while 1: - c = rawdata[j:j+1] - if not c: - return -1 - if c in string.whitespace: - j = j + 1 - else: - break - else: - j = i - name, j = self.scan_name(j, declstartpos) - if j < 0: - return j - while 1: - c = self.rawdata[j:j+1] - if not c: - return -1 - if c in "'\"": - m = declstringlit.match(rawdata, j) - 
if m: - j = m.end() - else: - return -1 # incomplete - elif c == ">": - return j + 1 - else: - name, j = self.scan_name(j, declstartpos) - if j < 0: - return j - - def scan_name(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - if i == n: - return None, -1 - m = declname.match(rawdata, i) - if m: - s = m.group() - name = s.strip() - if (i + len(s)) == n: - return None, -1 # end of buffer - return name.lower(), m.end() - else: - self.updatepos(declstartpos, i) - raise HTMLParseError("expected name token", self.getpos()) - # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata @@ -563,9 +277,8 @@ class HTMLParser: - string.rfind(self.__starttag_text, "\n") else: offset = offset + len(self.__starttag_text) - raise HTMLParseError("junk characters in start tag: %s" - % `rawdata[k:endpos][:20]`, - (lineno, offset)) + self.error("junk characters in start tag: %s" + % `rawdata[k:endpos][:20]`) if end[-2:] == '/>': # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) @@ -594,8 +307,7 @@ class HTMLParser: return -1 # else bogus input self.updatepos(i, j + 1) - raise HTMLParseError("malformed empty start tag", - self.getpos()) + self.error("malformed empty start tag") if next == "": # end of input return -1 @@ -605,8 +317,8 @@ class HTMLParser: # '/' from a '/>' ending return -1 self.updatepos(i, j) - raise HTMLParseError("malformed start tag", self.getpos()) - raise AssertionError("we should not gt here!") + self.error("malformed start tag") + raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): @@ -618,8 +330,7 @@ class HTMLParser: j = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: - raise HTMLParseError("bad end tag: %s" % `rawdata[i:j]`, - self.getpos()) + self.error("bad end tag: %s" % `rawdata[i:j]`) tag = match.group(1) 
self.handle_endtag(string.lower(tag)) return j @@ -661,6 +372,9 @@ class HTMLParser: def handle_pi(self, data): pass + def unknown_decl(self, data): + self.error("unknown declaration: " + `data`) + # Internal -- helper to remove special character quoting def unescape(self, s): if '&' not in s: |