diff options
author | Guido van Rossum <guido@python.org> | 1998-05-28 22:48:53 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-05-28 22:48:53 (GMT) |
commit | 1ad00717fb90ebfe1bb7727f9a89b1b06ebf9e5f (patch) | |
tree | 2dfe14bc7499fa5921763270f3a609f7b5087f55 /Lib/sgmllib.py | |
parent | ae621ff7b7226d093f944fbf27876e6750e768fc (diff) | |
download | cpython-1ad00717fb90ebfe1bb7727f9a89b1b06ebf9e5f.zip cpython-1ad00717fb90ebfe1bb7727f9a89b1b06ebf9e5f.tar.gz cpython-1ad00717fb90ebfe1bb7727f9a89b1b06ebf9e5f.tar.bz2 |
Patch by Lars Marius Garshol:
- Handle <? processing instructions >.
- Allow . and - in entity names.
Also fixed an oversight in the previous fix (in one place, [ \t\r\n]
was used instead of string.whitespace).
Diffstat (limited to 'Lib/sgmllib.py')
-rw-r--r-- | Lib/sgmllib.py | 32 |
1 files changed, 30 insertions, 2 deletions
```diff
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 956341c..6817608 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -20,12 +20,14 @@ incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                               '/([a-zA-Z][^<>]*)?|'
                               '![^<>]*)?')
 
-entityref = re.compile('&([a-zA-Z][a-zA-Z0-9]*)[^a-zA-Z0-9]')
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#([0-9]+)[^0-9]')
 
 starttagopen = re.compile('<[>a-zA-Z]')
 shorttagopen = re.compile('<[a-zA-Z][a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][a-zA-Z0-9]*)/([^/]*)/')
+piopen = re.compile('<\?')
+piclose = re.compile('>')
 endtagopen = re.compile('</[<>a-zA-Z]')
 endbracket = re.compile('[<>]')
 special = re.compile('<![^<>]*>')
@@ -33,7 +35,7 @@ commentopen = re.compile('<!--')
 commentclose = re.compile('--[%s]*>' % string.whitespace)
 tagfind = re.compile('[a-zA-Z][a-zA-Z0-9]*')
 attrfind = re.compile(
-    '[ \t\n\r]+([a-zA-Z_][-.a-zA-Z_0-9]*)'
+    '[%s]+([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
     + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
     + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
 
@@ -127,6 +129,15 @@ class SGMLParser:
                     if k < 0: break
                     i = i+k
                     continue
+                if piopen.match(rawdata, i):
+                    if self.literal:
+                        self.handle_data(rawdata[i])
+                        i = i+1
+                        continue
+                    k = self.parse_pi(i)
+                    if k < 0: break
+                    i = i+k
+                    continue
                 match = special.match(rawdata, i)
                 if match:
                     if self.literal:
@@ -184,6 +195,19 @@ class SGMLParser:
         j = match.end(0)
         return j-i
 
+    # Internal -- parse processing instr, return length or -1 if not terminated
+    def parse_pi(self, i):
+        rawdata = self.rawdata
+        if rawdata[i:i+2] <> '<?':
+            raise RuntimeError, 'unexpected call to handle_pi'
+        match = piclose.search(rawdata, i+2)
+        if not match:
+            return -1
+        j = match.start(0)
+        self.handle_pi(rawdata[i+2: j])
+        j = match.end(0)
+        return j-i
+
     # Internal -- handle starttag, return length or -1 if not terminated
     def parse_starttag(self, i):
         rawdata = self.rawdata
@@ -348,6 +372,10 @@ class SGMLParser:
     def handle_comment(self, data):
         pass
 
+    # Example -- handle processing instruction, could be overridden
+    def handle_pi(self, data):
+        pass
+
     # To be overridden -- handlers for unknown objects
     def unknown_starttag(self, tag, attrs): pass
     def unknown_endtag(self, tag): pass
```