diff options
author | Fred Drake <fdrake@acm.org> | 2000-06-29 18:50:59 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2000-06-29 18:50:59 (GMT) |
commit | b46696c0ed640992b4524aab888a26a56d993142 (patch) | |
tree | 273dd36d600f659ccb9909ea2ff9106f80a0df57 | |
parent | 8094611eb8abe9f9d1e1498f36324eebabaa0a09 (diff) | |
download | cpython-b46696c0ed640992b4524aab888a26a56d993142.zip cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.gz cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.bz2 |
[Old patch that hadn't been checked in.]
get_starttag_text(): New method.
Return the text of the most recently parsed start tag, from
the '<' to the '>' or '/'. Not really useful for structure
processing, but requested for Web-related use. May also be
useful for being able to re-generate the input from the parse
events, but there's no equivalent for end tags.
attrfind: Be a little more forgiving of unquoted attribute values by also accepting '&' and '$' in them.
-rw-r--r-- | Lib/sgmllib.py | 13 |
1 file changed, 11 insertions, 2 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 8be7d55..d7e8319 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -37,7 +37,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*') attrfind = re.compile( '[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace)) - + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?') + + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!&$\(\)_#=~]*))?') # SGML parser base class -- find tags and call handler functions. @@ -207,9 +207,15 @@ class SGMLParser: self.handle_pi(rawdata[i+2: j]) j = match.end(0) return j-i + + __starttag_text = None + def get_starttag_text(self): + return self.__starttag_text # Internal -- handle starttag, return length or -1 if not terminated def parse_starttag(self, i): + self.__starttag_text = None + start_pos = i rawdata = self.rawdata if shorttagopen.match(rawdata, i): # SGML shorthand: <tag/data/ == <tag>data</tag> @@ -220,9 +226,11 @@ class SGMLParser: if not match: return -1 tag, data = match.group(1, 2) + self.__starttag_text = '<%s/' % tag tag = string.lower(tag) - self.finish_shorttag(tag, data) k = match.end(0) + self.finish_shorttag(tag, data) + self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k # XXX The following should skip matching quotes (' or ") match = endbracket.search(rawdata, i+1) @@ -255,6 +263,7 @@ class SGMLParser: k = match.end(0) if rawdata[j] == '>': j = j+1 + self.__starttag_text = rawdata[start_pos:j] self.finish_starttag(tag, attrs) return j |