diff options
| author | Neal Norwitz <nnorwitz@gmail.com> | 2006-09-11 04:05:18 (GMT) |
|---|---|---|
| committer | Neal Norwitz <nnorwitz@gmail.com> | 2006-09-11 04:05:18 (GMT) |
| commit | 48829ba61d840cfcc6be63219b62476b4f9c7f7d (patch) | |
| tree | 69ab7600924a3a6dd3ad897117ced744dff8d76d /Lib/sgmllib.py | |
| parent | 2e488fdebf109a13a464dbe76f85b8dbf846c7f7 (diff) | |
| download | cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.zip cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.gz cpython-48829ba61d840cfcc6be63219b62476b4f9c7f7d.tar.bz2 | |
As mentioned on python-dev, reverting patch #1504333 because it introduced
an infinite loop in rev 47154.
This patch also adds a test to prevent the regression.
Will backport to 2.4 and head later.
Diffstat (limited to 'Lib/sgmllib.py')
| -rw-r--r-- | Lib/sgmllib.py | 19 |
1 file changed, 9 insertions, 10 deletions
```diff
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3020d11..3ab57c2 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]')
 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
-starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
-    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
-    r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
-    r')*\s*/?\s*(?=[<>])')
-endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
+endbracket = re.compile('[<>]')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase):
                 self.finish_shorttag(tag, data)
             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
             return k
-        match = starttag.match(rawdata, i)
+        # XXX The following should skip matching quotes (' or ")
+        # As a shortcut way to exit, this isn't so bad, but shouldn't
+        # be used to locate the actual end of the start tag since the
+        # < or > characters may be embedded in an attribute value.
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         if rawdata[i:i+2] == '<>':
@@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase):
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
-        match = endtag.match(rawdata, i)
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         tag = rawdata[i+2:j].strip().lower()
         if rawdata[j] == '>':
             j = j+1
```
