[Old patch that hadn't been checked in.]

get_starttag_text(): New method. Return the text of the most recently parsed start tag, from the '<' to the '>' or '/'. Not really useful for structure processing, but requested for Web-related use. May also be useful for being able to re-generate the input from the parse events, but there's no equivalent for end tags.

attrfind: Be a little more forgiving of unquoted attribute values.
author: Fred Drake <fdrake@acm.org> 2000-06-29 18:50:59 (GMT)
committer: Fred Drake <fdrake@acm.org> 2000-06-29 18:50:59 (GMT)
commit: b46696c0ed640992b4524aab888a26a56d993142 (patch)
tree: 273dd36d600f659ccb9909ea2ff9106f80a0df57 /Lib/sgmllib.py
parent: 8094611eb8abe9f9d1e1498f36324eebabaa0a09 (diff)
download: cpython-b46696c0ed640992b4524aab888a26a56d993142.zip
cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.gz
cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.bz2
1 files changed, 11 insertions, 2 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 8be7d55..d7e8319 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -37,7 +37,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
 attrfind = re.compile(
     '[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
     + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
-    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
+    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!&$\(\)_#=~]*))?')
 
 
 # SGML parser base class -- find tags and call handler functions.
@@ -207,9 +207,15 @@ class SGMLParser:
         self.handle_pi(rawdata[i+2: j])
         j = match.end(0)
         return j-i
+
+    __starttag_text = None
+    def get_starttag_text(self):
+        return self.__starttag_text
     
     # Internal -- handle starttag, return length or -1 if not terminated
     def parse_starttag(self, i):
+        self.__starttag_text = None
+        start_pos = i
         rawdata = self.rawdata
         if shorttagopen.match(rawdata, i):
             # SGML shorthand: <tag/data/ == <tag>data</tag>
@@ -220,9 +226,11 @@ class SGMLParser:
             if not match:
                 return -1
             tag, data = match.group(1, 2)
+            self.__starttag_text = '<%s/' % tag
             tag = string.lower(tag)
-            self.finish_shorttag(tag, data)
             k = match.end(0)
+            self.finish_shorttag(tag, data)
+            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
             return k
         # XXX The following should skip matching quotes (' or ")
         match = endbracket.search(rawdata, i+1)
@@ -255,6 +263,7 @@ class SGMLParser:
             k = match.end(0)
         if rawdata[j] == '>':
             j = j+1
+        self.__starttag_text = rawdata[start_pos:j]
         self.finish_starttag(tag, attrs)
         return j
author	Fred Drake <fdrake@acm.org>	2000-06-29 18:50:59 (GMT)
committer	Fred Drake <fdrake@acm.org>	2000-06-29 18:50:59 (GMT)
commit	b46696c0ed640992b4524aab888a26a56d993142 (patch)
tree	273dd36d600f659ccb9909ea2ff9106f80a0df57 /Lib/sgmllib.py
parent	8094611eb8abe9f9d1e1498f36324eebabaa0a09 (diff)
download	cpython-b46696c0ed640992b4524aab888a26a56d993142.zip cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.gz cpython-b46696c0ed640992b4524aab888a26a56d993142.tar.bz2