diff options
author | Neal Norwitz <nnorwitz@gmail.com> | 2006-09-11 04:24:09 (GMT) |
---|---|---|
committer | Neal Norwitz <nnorwitz@gmail.com> | 2006-09-11 04:24:09 (GMT) |
commit | bcc119a22ca98facc80e7350b3ffca3335c9d288 (patch) | |
tree | 01e60cdc702c9b36dd9cdb437b12a5a349bd3a40 | |
parent | ca2ca79d23645eb2ee457f64506d05f232c673c9 (diff) | |
download | cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.zip cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.tar.gz cpython-bcc119a22ca98facc80e7350b3ffca3335c9d288.tar.bz2 |
Forward port of rev 51850 from the release25-maint branch.
As mentioned on python-dev, reverting patch #1504333 to sgmllib because it
introduced an infinite loop in the parser (rev 47154).
This patch also adds a test to prevent the regression.
-rw-r--r-- | Lib/sgmllib.py | 19 | ||||
-rw-r--r-- | Lib/test/sgml_input.html | 212 | ||||
-rw-r--r-- | Lib/test/test_sgmllib.py | 28 | ||||
-rw-r--r-- | Misc/NEWS | 2 |
4 files changed, 236 insertions, 25 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 3020d11..3ab57c2 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' - r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' - r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' - r')*\s*/?\s*(?=[<>])') -endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])') +endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase): self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - match = starttag.match(rawdata, i) + # XXX The following should skip matching quotes (' or ") + # As a shortcut way to exit, this isn't so bad, but shouldn't + # be used to locate the actual end of the start tag since the + # < or > characters may be embedded in an attribute value. 
+ match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase): # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endtag.match(rawdata, i) + match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 diff --git a/Lib/test/sgml_input.html b/Lib/test/sgml_input.html new file mode 100644 index 0000000..f4d2e6c --- /dev/null +++ b/Lib/test/sgml_input.html @@ -0,0 +1,212 @@ +<html> + <head> + <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"> + <link rel="stylesheet" type="text/css" href="http://ogame182.de/epicblue/formate.css"> + <script language="JavaScript" src="js/flotten.js"></script> + </head> + <body> + <script language=JavaScript> if (parent.frames.length == 0) { top.location.href = "http://es.ogame.org/"; } </script> <script language="JavaScript"> +function haha(z1) { + eval("location='"+z1.options[z1.selectedIndex].value+"'"); +} +</script> +<center> +<table> + <tr> + <td></td> + <td> + <center> + <table> + <tr> + <td><img src="http://ogame182.de/epicblue/planeten/small/s_dschjungelplanet04.jpg" width="50" height="50"></td> + <td> + <table border="1"> + <select size="1" onchange="haha(this)"> + <option value="/game/flotten1.php?session=8912ae912fec&cp=33875341&mode=Flotte&gid=&messageziel=&re=0" selected>Alien sex friend [2:250:6]</option> + <option value="/game/flotten1.php?session=8912ae912fec&cp=33905100&mode=Flotte&gid=&messageziel=&re=0" >1989 [2:248:14]</option> + <option value="/game/flotten1.php?session=8912ae912fec&cp=34570808&mode=Flotte&gid=&messageziel=&re=0" >1990 [2:248:6]</option> + <option 
value="/game/flotten1.php?session=8912ae912fec&cp=34570858&mode=Flotte&gid=&messageziel=&re=0" >1991 [2:254:6]</option> + <option value="/game/flotten1.php?session=8912ae912fec&cp=34572929&mode=Flotte&gid=&messageziel=&re=0" >Colonia [2:253:12]</option> + </select> + </table> + </td> + </tr> + </table> + </center> + </td> + <td> + <table border="0" width="100%" cellspacing="0" cellpadding="0"> + <tr> + <td align="center"></td> + <td align="center" width="85"> + <img border="0" src="http://ogame182.de/epicblue/images/metall.gif" width="42" height="22"> + </td> + <td align="center" width="85"> + <img border="0" src="http://ogame182.de/epicblue/images/kristall.gif" width="42" height="22"> + </td> + <td align="center" width="85"> + <img border="0" src="http://ogame182.de/epicblue/images/deuterium.gif" width="42" height="22"> + </td> + <td align="center" width="85"> + <img border="0" src="http://ogame182.de/epicblue/images/energie.gif" width="42" height="22"> + </td> + <td align="center"></td> + </tr> + <tr> + <td align="center"><i><b> </b></i></td> + <td align="center" width="85"><i><b><font color="#ffffff">Metal</font></b></i></td> + <td align="center" width="85"><i><b><font color="#ffffff">Cristal</font></b></i></td> + <td align="center" width="85"><i><b><font color="#ffffff">Deuterio</font></b></i></td> + <td align="center" width="85"><i><b><font color="#ffffff">Energía</font></b></i></td> + <td align="center"><i><b> </b></i></td> + </tr> + <tr> + <td align="center"></td> + <td align="center" width="85">160.636</td> + <td align="center" width="85">3.406</td> + <td align="center" width="85">39.230</td> + <td align="center" width="85"><font color=#ff0000>-80</font>/3.965</td> + <td align="center"></td> + </tr> + </table> + </tr> + </table> + </center> +<br /> + <script language="JavaScript"> + <!-- + function link_to_gamepay() { + self.location = "https://www.gamepay.de/?lang=es&serverID=8&userID=129360&gameID=ogame&gui=v2&chksum=a9751afa9e37e6b1b826356bcca45675"; + } 
+//--> + </script> +<center> + <table width="519" border="0" cellpadding="0" cellspacing="1"> + <tr height="20"> + <td colspan="8" class="c">Flotas (max. 9)</td> + </tr> + <tr height="20"> + <th>Num.</th> + <th>Misión</th> + <th>Cantidad</th> + <th>Comienzo</th> + <th>Salida</th> + <th>Objetivo</th> + <th>Llegada</th> + <th>Orden</th> + </tr> + <tr height="20"> + <th>1</th> + <th> + <a title="">Espionaje</a> + <a title="Flota en el planeta">(F)</a> + </th> + <th> <a title="Sonda de espionaje: 3 +">3</a></th> + <th>[2:250:6]</th> + <th>Wed Aug 9 18:00:02</th> + <th>[2:242:5]</th> + <th>Wed Aug 9 18:01:02</th> + <th> + <form action="flotten1.php?session=8912ae912fec" method="POST"> + <input type="hidden" name="order_return" value="25054490" /> + <input type="submit" value="Enviar de regreso" /> + </form> + </th> + </tr> + <tr height="20"> + <th>2</th> + <th> + <a title="">Espionaje</a> + <a title="Volver al planeta">(V)</a> + </th> + <th> <a title="Sonda de espionaje: 3 +">3</a></th> + <th>[2:250:6]</th> + <th>Wed Aug 9 17:59:55</th> + <th>[2:242:1]</th> + <th>Wed Aug 9 18:01:55</th> + <th> + </th> + </tr> + </table> + + + +<form action="flotten2.php?session=8912ae912fec" method="POST"> + <table width="519" border="0" cellpadding="0" cellspacing="1"> + <tr height="20"> + <td colspan="4" class="c">Nueva misión: elegir naves</td> + </tr> + <tr height="20"> + <th>Naves</th> + <th>Disponibles</th> +<!-- <th>Gesch.</th> --> + <th>-</th> + <th>-</th> + </tr> + <tr height="20"> + <th><a title="Velocidad: 8500">Nave pequeña de carga</a></th> + <th>10<input type="hidden" name="maxship202" value="10"/></th> +<!-- <th>8500 --> + <input type="hidden" name="consumption202" value="10"/> + <input type="hidden" name="speed202" value="8500" /></th> + <input type="hidden" name="capacity202" value="5000" /></th> + <th><a href="javascript:maxShip('ship202');" >máx</a> </th> + <th><input name="ship202" size="10" value="0" alt="Nave pequeña de carga 10"/></th> + </tr> + <tr height="20"> + 
<th><a title="Velocidad: 12750">Nave grande de carga</a></th> + <th>19<input type="hidden" name="maxship203" value="19"/></th> +<!-- <th>12750 --> + <input type="hidden" name="consumption203" value="50"/> + <input type="hidden" name="speed203" value="12750" /></th> + <input type="hidden" name="capacity203" value="25000" /></th> + <th><a href="javascript:maxShip('ship203');" >máx</a> </th> + <th><input name="ship203" size="10" value="0" alt="Nave grande de carga 19"/></th> + </tr> + <tr height="20"> + <th><a title="Velocidad: 27000">Crucero</a></th> + <th>6<input type="hidden" name="maxship206" value="6"/></th> +<!-- <th>27000 --> + <input type="hidden" name="consumption206" value="300"/> + <input type="hidden" name="speed206" value="27000" /></th> + <input type="hidden" name="capacity206" value="800" /></th> + <th><a href="javascript:maxShip('ship206');" >máx</a> </th> + <th><input name="ship206" size="10" value="0" alt="Crucero 6"/></th> + </tr> + <tr height="20"> + <th><a title="Velocidad: 3400">Reciclador</a></th> + <th>1<input type="hidden" name="maxship209" value="1"/></th> +<!-- <th>3400 --> + <input type="hidden" name="consumption209" value="300"/> + <input type="hidden" name="speed209" value="3400" /></th> + <input type="hidden" name="capacity209" value="20000" /></th> + <th><a href="javascript:maxShip('ship209');" >máx</a> </th> + <th><input name="ship209" size="10" value="0" alt="Reciclador 1"/></th> + </tr> + <tr height="20"> + <th><a title="Velocidad: 170000000">Sonda de espionaje</a></th> + <th>139<input type="hidden" name="maxship210" value="139"/></th> +<!-- <th>170000000 --> + <input type="hidden" name="consumption210" value="1"/> + <input type="hidden" name="speed210" value="170000000" /></th> + <input type="hidden" name="capacity210" value="5" /></th> + <th><a href="javascript:maxShip('ship210');" >máx</a> </th> + <th><input name="ship210" size="10" value="0" alt="Sonda de espionaje 139"/></th> + </tr> + <tr height="20"> + <th colspan="2"><a 
href="javascript:noShips();" >Ninguna nave</a></th> + <th colspan="2"><a href="javascript:maxShips();" >Todas las naves</a></th> + </tr> + <tr height="20"> + <th colspan="4"><input type="submit" value="Continuar" /></th> + </tr> +<tr><th colspan=4> +<iframe id='a44fb522' name='a44fb522' src='http://ads.gameforgeads.de/adframe.php?n=a44fb522&what=zone:578' framespacing='0' frameborder='no' scrolling='no' width='468' height='60'></iframe> +<br><center></center></br> +</th></tr> +</form> +</table> + </body> +</html> diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 28a21a4..b698636 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -286,21 +286,6 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('codepoint', 'convert', 42), ]) - def test_attr_values_quoted_markup(self): - """Multi-line and markup in attribute values""" - self.check_events("""<a title='foo\n<br>bar'>text</a>""", - [("starttag", "a", [("title", "foo\n<br>bar")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""<a title='less < than'>text</a>""", - [("starttag", "a", [("title", "less < than")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""<a title='greater > than'>text</a>""", - [("starttag", "a", [("title", "greater > than")]), - ("data", "text"), - ("endtag", "a")]) - def test_attr_funky_names(self): self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), @@ -376,6 +361,19 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'), ]) + def test_read_chunks(self): + # SF bug #1541697, this caused sgml parser to hang + # Just verify this code doesn't cause a hang. 
+ CHUNK = 1024 # increasing this to 8212 makes the problem go away + + f = open(test_support.findfile('sgml_input.html')) + fp = sgmllib.SGMLParser() + while 1: + data = f.read(CHUNK) + fp.feed(data) + if len(data) != CHUNK: + break + # XXX These tests have been disabled by prefixing their names with # an underscore. The first two exercise outstanding bugs in the # sgmllib module, and the third exhibits questionable behavior @@ -40,6 +40,8 @@ Core and builtins Library ------- +- Reverted patch #1504333 to sgmllib because it introduced an infinite loop. + - Patch #1553314: Fix the inspect.py slowdown that was hurting IPython & SAGE by adding smarter caching in inspect.getmodule() |