From bcc119a22ca98facc80e7350b3ffca3335c9d288 Mon Sep 17 00:00:00 2001 From: Neal Norwitz Date: Mon, 11 Sep 2006 04:24:09 +0000 Subject: Forward port of 51850 from release25-maint branch. As mentioned on python-dev, reverting patch #1504333 because it introduced an infinite loop in rev 47154. This patch also adds a test to prevent the regression. --- Lib/sgmllib.py | 19 ++--- Lib/test/sgml_input.html | 212 +++++++++++++++++++++++++++++++++++++++++++++++ Lib/test/test_sgmllib.py | 28 +++---- Misc/NEWS | 2 + 4 files changed, 236 insertions(+), 25 deletions(-) create mode 100644 Lib/test/sgml_input.html diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 3020d11..3ab57c2 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' - r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' - r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' - r')*\s*/?\s*(?=[<>])') -endtag = re.compile(r'])') +endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase): self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - match = starttag.match(rawdata, i) + # XXX The following should skip matching quotes (' or ") + # As a shortcut way to exit, this isn't so bad, but shouldn't + # be used to locate the actual end of the start tag since the + # < or > characters may be embedded in an attribute value. + match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase): # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endtag.match(rawdata, i) + match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 diff --git a/Lib/test/sgml_input.html b/Lib/test/sgml_input.html new file mode 100644 index 0000000..f4d2e6c --- /dev/null +++ b/Lib/test/sgml_input.html @@ -0,0 +1,212 @@ + + + + + + + + +
+ + + + + +
+
+ + + + + +
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
  MetalCristalDeuterioEnergía  
160.6363.40639.230-80/3.965
+
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Flotas (max. 9)
Num.MisiónCantidadComienzoSalidaObjetivoLlegadaOrden
1 + Espionaje + (F) + 3[2:250:6]Wed Aug 9 18:00:02[2:242:5]Wed Aug 9 18:01:02 +
+ + +
+
2 + Espionaje + (V) + 3[2:250:6]Wed Aug 9 17:59:55[2:242:1]Wed Aug 9 18:01:55 +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Nueva misión: elegir naves
NavesDisponibles--
Nave pequeña de carga10máx
Nave grande de carga19máx
Crucero6máx
Reciclador1máx
Sonda de espionaje139máx
Ninguna naveTodas las naves
+ +

+
+ + diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 28a21a4..b698636 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -286,21 +286,6 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('codepoint', 'convert', 42), ]) - def test_attr_values_quoted_markup(self): - """Multi-line and markup in attribute values""" - self.check_events("""text""", - [("starttag", "a", [("title", "foo\n
bar")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""text""", - [("starttag", "a", [("title", "less < than")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""text""", - [("starttag", "a", [("title", "greater > than")]), - ("data", "text"), - ("endtag", "a")]) - def test_attr_funky_names(self): self.check_events("""""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), @@ -376,6 +361,19 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('decl', 'DOCTYPE doc []'), ]) + def test_read_chunks(self): + # SF bug #1541697, this caused sgml parser to hang + # Just verify this code doesn't cause a hang. + CHUNK = 1024 # increasing this to 8212 makes the problem go away + + f = open(test_support.findfile('sgml_input.html')) + fp = sgmllib.SGMLParser() + while 1: + data = f.read(CHUNK) + fp.feed(data) + if len(data) != CHUNK: + break + # XXX These tests have been disabled by prefixing their names with # an underscore. The first two exercise outstanding bugs in the # sgmllib module, and the third exhibits questionable behavior diff --git a/Misc/NEWS b/Misc/NEWS index 34c3e7c..6f12bce 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -40,6 +40,8 @@ Core and builtins Library ------- +- Reverted patch #1504333 to sgmllib because it introduced an infinite loop. + - Patch #1553314: Fix the inspect.py slowdown that was hurting IPython & SAGE by adding smarter caching in inspect.getmodule() -- cgit v0.12