summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Neal Norwitz <nnorwitz@gmail.com>, 2006-09-11 04:24:09 (GMT)
committer: Neal Norwitz <nnorwitz@gmail.com>, 2006-09-11 04:24:09 (GMT)
commit: bcc119a22ca98facc80e7350b3ffca3335c9d288 (patch)
tree: 01e60cdc702c9b36dd9cdb437b12a5a349bd3a40 /Lib
parent: ca2ca79d23645eb2ee457f64506d05f232c673c9 (diff)
download: cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.zip
download: cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.tar.gz
download: cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.tar.bz2
Forward port of 51850 from release25-maint branch.
As mentioned on python-dev, reverting patch #1504333 because it introduced an infinite loop in rev 47154. This patch also adds a test to prevent the regression.
Diffstat (limited to 'Lib')
-rw-r--r--  Lib/sgmllib.py             |  19
-rw-r--r--  Lib/test/sgml_input.html   | 212
-rw-r--r--  Lib/test/test_sgmllib.py   |  28
3 files changed, 234 insertions, 25 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3020d11..3ab57c2 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
-starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
- r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
- r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
- r')*\s*/?\s*(?=[<>])')
-endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
+endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase):
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
- match = starttag.match(rawdata, i)
+ # XXX The following should skip matching quotes (' or ")
+ # As a shortcut way to exit, this isn't so bad, but shouldn't
+ # be used to locate the actual end of the start tag since the
+ # < or > characters may be embedded in an attribute value.
+ match = endbracket.search(rawdata, i+1)
if not match:
return -1
- j = match.end(0)
+ j = match.start(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
@@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase):
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endtag.match(rawdata, i)
+ match = endbracket.search(rawdata, i+1)
if not match:
return -1
- j = match.end(0)
+ j = match.start(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
diff --git a/Lib/test/sgml_input.html b/Lib/test/sgml_input.html
new file mode 100644
index 0000000..f4d2e6c
--- /dev/null
+++ b/Lib/test/sgml_input.html
@@ -0,0 +1,212 @@
+<html>
+ <head>
+ <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+ <link rel="stylesheet" type="text/css" href="http://ogame182.de/epicblue/formate.css">
+ <script language="JavaScript" src="js/flotten.js"></script>
+ </head>
+ <body>
+ <script language=JavaScript> if (parent.frames.length == 0) { top.location.href = "http://es.ogame.org/"; } </script> <script language="JavaScript">
+function haha(z1) {
+ eval("location='"+z1.options[z1.selectedIndex].value+"'");
+}
+</script>
+<center>
+<table>
+ <tr>
+ <td></td>
+ <td>
+ <center>
+ <table>
+ <tr>
+ <td><img src="http://ogame182.de/epicblue/planeten/small/s_dschjungelplanet04.jpg" width="50" height="50"></td>
+ <td>
+ <table border="1">
+ <select size="1" onchange="haha(this)">
+ <option value="/game/flotten1.php?session=8912ae912fec&cp=33875341&mode=Flotte&gid=&messageziel=&re=0" selected>Alien sex friend [2:250:6]</option>
+ <option value="/game/flotten1.php?session=8912ae912fec&cp=33905100&mode=Flotte&gid=&messageziel=&re=0" >1989 [2:248:14]</option>
+ <option value="/game/flotten1.php?session=8912ae912fec&cp=34570808&mode=Flotte&gid=&messageziel=&re=0" >1990 [2:248:6]</option>
+ <option value="/game/flotten1.php?session=8912ae912fec&cp=34570858&mode=Flotte&gid=&messageziel=&re=0" >1991 [2:254:6]</option>
+ <option value="/game/flotten1.php?session=8912ae912fec&cp=34572929&mode=Flotte&gid=&messageziel=&re=0" >Colonia [2:253:12]</option>
+ </select>
+ </table>
+ </td>
+ </tr>
+ </table>
+ </center>
+ </td>
+ <td>
+ <table border="0" width="100%" cellspacing="0" cellpadding="0">
+ <tr>
+ <td align="center"></td>
+ <td align="center" width="85">
+ <img border="0" src="http://ogame182.de/epicblue/images/metall.gif" width="42" height="22">
+ </td>
+ <td align="center" width="85">
+ <img border="0" src="http://ogame182.de/epicblue/images/kristall.gif" width="42" height="22">
+ </td>
+ <td align="center" width="85">
+ <img border="0" src="http://ogame182.de/epicblue/images/deuterium.gif" width="42" height="22">
+ </td>
+ <td align="center" width="85">
+ <img border="0" src="http://ogame182.de/epicblue/images/energie.gif" width="42" height="22">
+ </td>
+ <td align="center"></td>
+ </tr>
+ <tr>
+ <td align="center"><i><b>&nbsp;&nbsp;</b></i></td>
+ <td align="center" width="85"><i><b><font color="#ffffff">Metal</font></b></i></td>
+ <td align="center" width="85"><i><b><font color="#ffffff">Cristal</font></b></i></td>
+ <td align="center" width="85"><i><b><font color="#ffffff">Deuterio</font></b></i></td>
+ <td align="center" width="85"><i><b><font color="#ffffff">Energía</font></b></i></td>
+ <td align="center"><i><b>&nbsp;&nbsp;</b></i></td>
+ </tr>
+ <tr>
+ <td align="center"></td>
+ <td align="center" width="85">160.636</td>
+ <td align="center" width="85">3.406</td>
+ <td align="center" width="85">39.230</td>
+ <td align="center" width="85"><font color=#ff0000>-80</font>/3.965</td>
+ <td align="center"></td>
+ </tr>
+ </table>
+ </tr>
+ </table>
+ </center>
+<br />
+ <script language="JavaScript">
+ <!--
+ function link_to_gamepay() {
+ self.location = "https://www.gamepay.de/?lang=es&serverID=8&userID=129360&gameID=ogame&gui=v2&chksum=a9751afa9e37e6b1b826356bcca45675";
+ }
+//-->
+ </script>
+<center>
+ <table width="519" border="0" cellpadding="0" cellspacing="1">
+ <tr height="20">
+ <td colspan="8" class="c">Flotas (max. 9)</td>
+ </tr>
+ <tr height="20">
+ <th>Num.</th>
+ <th>Misión</th>
+ <th>Cantidad</th>
+ <th>Comienzo</th>
+ <th>Salida</th>
+ <th>Objetivo</th>
+ <th>Llegada</th>
+ <th>Orden</th>
+ </tr>
+ <tr height="20">
+ <th>1</th>
+ <th>
+ <a title="">Espionaje</a>
+ <a title="Flota en el planeta">(F)</a>
+ </th>
+ <th> <a title="Sonda de espionaje: 3
+">3</a></th>
+ <th>[2:250:6]</th>
+ <th>Wed Aug 9 18:00:02</th>
+ <th>[2:242:5]</th>
+ <th>Wed Aug 9 18:01:02</th>
+ <th>
+ <form action="flotten1.php?session=8912ae912fec" method="POST">
+ <input type="hidden" name="order_return" value="25054490" />
+ <input type="submit" value="Enviar de regreso" />
+ </form>
+ </th>
+ </tr>
+ <tr height="20">
+ <th>2</th>
+ <th>
+ <a title="">Espionaje</a>
+ <a title="Volver al planeta">(V)</a>
+ </th>
+ <th> <a title="Sonda de espionaje: 3
+">3</a></th>
+ <th>[2:250:6]</th>
+ <th>Wed Aug 9 17:59:55</th>
+ <th>[2:242:1]</th>
+ <th>Wed Aug 9 18:01:55</th>
+ <th>
+ </th>
+ </tr>
+ </table>
+
+
+
+<form action="flotten2.php?session=8912ae912fec" method="POST">
+ <table width="519" border="0" cellpadding="0" cellspacing="1">
+ <tr height="20">
+ <td colspan="4" class="c">Nueva misión: elegir naves</td>
+ </tr>
+ <tr height="20">
+ <th>Naves</th>
+ <th>Disponibles</th>
+<!-- <th>Gesch.</th> -->
+ <th>-</th>
+ <th>-</th>
+ </tr>
+ <tr height="20">
+ <th><a title="Velocidad: 8500">Nave pequeña de carga</a></th>
+ <th>10<input type="hidden" name="maxship202" value="10"/></th>
+<!-- <th>8500 -->
+ <input type="hidden" name="consumption202" value="10"/>
+ <input type="hidden" name="speed202" value="8500" /></th>
+ <input type="hidden" name="capacity202" value="5000" /></th>
+ <th><a href="javascript:maxShip('ship202');" >máx</a> </th>
+ <th><input name="ship202" size="10" value="0" alt="Nave pequeña de carga 10"/></th>
+ </tr>
+ <tr height="20">
+ <th><a title="Velocidad: 12750">Nave grande de carga</a></th>
+ <th>19<input type="hidden" name="maxship203" value="19"/></th>
+<!-- <th>12750 -->
+ <input type="hidden" name="consumption203" value="50"/>
+ <input type="hidden" name="speed203" value="12750" /></th>
+ <input type="hidden" name="capacity203" value="25000" /></th>
+ <th><a href="javascript:maxShip('ship203');" >máx</a> </th>
+ <th><input name="ship203" size="10" value="0" alt="Nave grande de carga 19"/></th>
+ </tr>
+ <tr height="20">
+ <th><a title="Velocidad: 27000">Crucero</a></th>
+ <th>6<input type="hidden" name="maxship206" value="6"/></th>
+<!-- <th>27000 -->
+ <input type="hidden" name="consumption206" value="300"/>
+ <input type="hidden" name="speed206" value="27000" /></th>
+ <input type="hidden" name="capacity206" value="800" /></th>
+ <th><a href="javascript:maxShip('ship206');" >máx</a> </th>
+ <th><input name="ship206" size="10" value="0" alt="Crucero 6"/></th>
+ </tr>
+ <tr height="20">
+ <th><a title="Velocidad: 3400">Reciclador</a></th>
+ <th>1<input type="hidden" name="maxship209" value="1"/></th>
+<!-- <th>3400 -->
+ <input type="hidden" name="consumption209" value="300"/>
+ <input type="hidden" name="speed209" value="3400" /></th>
+ <input type="hidden" name="capacity209" value="20000" /></th>
+ <th><a href="javascript:maxShip('ship209');" >máx</a> </th>
+ <th><input name="ship209" size="10" value="0" alt="Reciclador 1"/></th>
+ </tr>
+ <tr height="20">
+ <th><a title="Velocidad: 170000000">Sonda de espionaje</a></th>
+ <th>139<input type="hidden" name="maxship210" value="139"/></th>
+<!-- <th>170000000 -->
+ <input type="hidden" name="consumption210" value="1"/>
+ <input type="hidden" name="speed210" value="170000000" /></th>
+ <input type="hidden" name="capacity210" value="5" /></th>
+ <th><a href="javascript:maxShip('ship210');" >máx</a> </th>
+ <th><input name="ship210" size="10" value="0" alt="Sonda de espionaje 139"/></th>
+ </tr>
+ <tr height="20">
+ <th colspan="2"><a href="javascript:noShips();" >Ninguna nave</a></th>
+ <th colspan="2"><a href="javascript:maxShips();" >Todas las naves</a></th>
+ </tr>
+ <tr height="20">
+ <th colspan="4"><input type="submit" value="Continuar" /></th>
+ </tr>
+<tr><th colspan=4>
+<iframe id='a44fb522' name='a44fb522' src='http://ads.gameforgeads.de/adframe.php?n=a44fb522&amp;what=zone:578' framespacing='0' frameborder='no' scrolling='no' width='468' height='60'></iframe>
+<br><center></center></br>
+</th></tr>
+</form>
+</table>
+ </body>
+</html>
diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py
index 28a21a4..b698636 100644
--- a/Lib/test/test_sgmllib.py
+++ b/Lib/test/test_sgmllib.py
@@ -286,21 +286,6 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
('codepoint', 'convert', 42),
])
- def test_attr_values_quoted_markup(self):
- """Multi-line and markup in attribute values"""
- self.check_events("""<a title='foo\n<br>bar'>text</a>""",
- [("starttag", "a", [("title", "foo\n<br>bar")]),
- ("data", "text"),
- ("endtag", "a")])
- self.check_events("""<a title='less < than'>text</a>""",
- [("starttag", "a", [("title", "less < than")]),
- ("data", "text"),
- ("endtag", "a")])
- self.check_events("""<a title='greater > than'>text</a>""",
- [("starttag", "a", [("title", "greater > than")]),
- ("data", "text"),
- ("endtag", "a")])
-
def test_attr_funky_names(self):
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
@@ -376,6 +361,19 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
])
+ def test_read_chunks(self):
+ # SF bug #1541697, this caused sgml parser to hang
+ # Just verify this code doesn't cause a hang.
+ CHUNK = 1024 # increasing this to 8212 makes the problem go away
+
+ f = open(test_support.findfile('sgml_input.html'))
+ fp = sgmllib.SGMLParser()
+ while 1:
+ data = f.read(CHUNK)
+ fp.feed(data)
+ if len(data) != CHUNK:
+ break
+
# XXX These tests have been disabled by prefixing their names with
# an underscore. The first two exercise outstanding bugs in the
# sgmllib module, and the third exhibits questionable behavior