As mentioned on python-dev, reverting patch #1504333 because it introduced an infinite loop in rev 47154. This patch also adds a test to prevent the regression. Will backport to 2.4 and head later.
author: Neal Norwitz <nnorwitz@gmail.com> 2006-09-11 04:05:18 (GMT)
committer: Neal Norwitz <nnorwitz@gmail.com> 2006-09-11 04:05:18 (GMT)
commit: 48829ba61d840cfcc6be63219b62476b4f9c7f7d (patch)
tree: 69ab7600924a3a6dd3ad897117ced744dff8d76d /Lib/sgmllib.py
parent: 2e488fdebf109a13a464dbe76f85b8dbf846c7f7 (diff)
download: cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.zip
cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.gz
cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.bz2
1 files changed, 9 insertions, 10 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3020d11..3ab57c2 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]')
 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
-starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
-        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
-        r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
-    r')*\s*/?\s*(?=[<>])')
-endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
+endbracket = re.compile('[<>]')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase):
             self.finish_shorttag(tag, data)
             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
             return k
-        match = starttag.match(rawdata, i)
+        # XXX The following should skip matching quotes (' or ")
+        # As a shortcut way to exit, this isn't so bad, but shouldn't
+        # be used to locate the actual end of the start tag since the
+        # < or > characters may be embedded in an attribute value.
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         if rawdata[i:i+2] == '<>':
@@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase):
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
-        match = endtag.match(rawdata, i)
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         tag = rawdata[i+2:j].strip().lower()
         if rawdata[j] == '>':
             j = j+1
author	Neal Norwitz <nnorwitz@gmail.com>	2006-09-11 04:05:18 (GMT)
committer	Neal Norwitz <nnorwitz@gmail.com>	2006-09-11 04:05:18 (GMT)
commit	48829ba61d840cfcc6be63219b62476b4f9c7f7d (patch)
tree	69ab7600924a3a6dd3ad897117ced744dff8d76d /Lib/sgmllib.py
parent	2e488fdebf109a13a464dbe76f85b8dbf846c7f7 (diff)
download	cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.zip cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.gz cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.bz2