diff options
author | Fred Drake <fdrake@acm.org> | 2001-03-16 20:04:57 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-03-16 20:04:57 (GMT) |
commit | 669573726bb31d8862e98b3843549e5fe0b54d37 (patch) | |
tree | 5969d357eebd6a8f790fb13d54a86d8e9a0a9f0a | |
parent | 83e01bf6c84719f8977d1ffd721d75d8eda2ddf8 (diff) | |
download | cpython-669573726bb31d8862e98b3843549e5fe0b54d37.zip cpython-669573726bb31d8862e98b3843549e5fe0b54d37.tar.gz cpython-669573726bb31d8862e98b3843549e5fe0b54d37.tar.bz2 |
Change RuntimeError to SGMLParseError, which subclasses RuntimeError
for backward compatibility.
Add support for SGML declaration syntax (<!....>) to some reasonable
degree. This does not support everything allowed in SGML, but should
work with "real" HTML (internal subset in a DOCTYPE is not handled).
The content of the declaration is passed to the .handle_decl() method,
which can be overridden by subclasses.
-rw-r--r-- | Lib/sgmllib.py | 58 |
1 files changed, 53 insertions, 5 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 19580e2..2ebe698 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -39,6 +39,14 @@ attrfind = re.compile( r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?') +declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*') +declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*') + + +class SGMLParseError(RuntimeError): + """Exception raised for all parse errors.""" + pass + # SGML parser base class -- find tags and call handler functions. # Usage: p = SGMLParser(); p.feed(data); ...; p.close(). @@ -144,7 +152,12 @@ class SGMLParser: self.handle_data(rawdata[i]) i = i+1 continue - i = match.end(0) + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + k = self.parse_declaration(i) + if k < 0: break + i = k continue elif rawdata[i] == '&': match = charref.match(rawdata, i) @@ -162,7 +175,7 @@ class SGMLParser: if rawdata[i-1] != ';': i = i-1 continue else: - raise RuntimeError, 'neither < nor & ??' + raise SGMLParserError('neither < nor & ??') # We get here only if incomplete matches but # nothing else match = incomplete.match(rawdata, i) @@ -186,7 +199,7 @@ class SGMLParser: def parse_comment(self, i): rawdata = self.rawdata if rawdata[i:i+4] != '<!--': - raise RuntimeError, 'unexpected call to handle_comment' + raise SGMLParseError('unexpected call to parse_comment()') match = commentclose.search(rawdata, i+4) if not match: return -1 @@ -195,11 +208,42 @@ class SGMLParser: j = match.end(0) return j-i + # Internal -- parse declaration. + def parse_declaration(self, i): + rawdata = self.rawdata + j = i + 2 + # in practice, this should look like: ((name|stringlit) S*)+ '>' + while 1: + c = rawdata[j:j+1] + if c == ">": + # end of declaration syntax + self.handle_decl(rawdata[i+2:j]) + return j + 1 + if c in "\"'": + m = declstringlit.match(rawdata, j) + if not m: + # incomplete or an error? + return -1 + j = m.end() + elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + m = declname.match(rawdata, j) + if not m: + # incomplete or an error? + return -1 + j = m.end() + elif i == len(rawdata): + # end of buffer between tokens + return -1 + else: + raise SGMLParseError( + "unexpected char in declaration: %s" % `rawdata[i]`) + assert 0, "can't get here!" + # Internal -- parse processing instr, return length or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata if rawdata[i:i+2] != '<?': - raise RuntimeError, 'unexpected call to handle_pi' + raise SGMLParseError('unexpected call to parse_pi()') match = piclose.search(rawdata, i+2) if not match: return -1 @@ -246,7 +290,7 @@ class SGMLParser: else: match = tagfind.match(rawdata, i+1) if not match: - raise RuntimeError, 'unexpected call to parse_starttag' + raise SGMLParseError('unexpected call to parse_starttag') k = match.end(0) tag = rawdata[i+1:k].lower() self.lasttag = tag @@ -383,6 +427,10 @@ class SGMLParser: def handle_comment(self, data): pass + # Example -- handle declaration, could be overridden + def handle_decl(self, decl): + pass + # Example -- handle processing instruction, could be overridden def handle_pi(self, data): pass |