diff options
author | Fred Drake <fdrake@acm.org> | 2006-06-29 00:51:53 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2006-06-29 00:51:53 (GMT) |
commit | a136210a9fe9aff79dd3457a53eae35a823f97fc (patch) | |
tree | 27a8c98aefe6b85b90ee2cb47cb2cad610ea7091 /Lib | |
parent | 960a3f88e5f49474f046d7e9bb9133f0ecbda5a5 (diff) | |
download | cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.zip cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.tar.gz cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.tar.bz2 |
SF bug #1504333: sgmllib should allow angle brackets in quoted values
(modified patch by Sam Ruby; changed to use separate REs for start and end
tags to reduce matching cost for end tags; extended tests; updated to avoid
breaking previous changes to support IPv6 addresses in unquoted attribute
values)
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sgmllib.py | 19 | ||||
-rw-r--r-- | Lib/test/test_sgmllib.py | 15 |
2 files changed, 25 insertions, 9 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 3ab57c2..3020d11 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -29,7 +29,12 @@ starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -endbracket = re.compile('[<>]') +starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' + r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' + r')*\s*/?\s*(?=[<>])') +endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -249,14 +254,10 @@ class SGMLParser(markupbase.ParserBase): self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - # XXX The following should skip matching quotes (' or ") - # As a shortcut way to exit, this isn't so bad, but shouldn't - # be used to locate the actual end of the start tag since the - # < or > characters may be embedded in an attribute value. 
- match = endbracket.search(rawdata, i+1) + match = starttag.match(rawdata, i) if not match: return -1 - j = match.start(0) + j = match.end(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -305,10 +306,10 @@ class SGMLParser(markupbase.ParserBase): # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endbracket.search(rawdata, i+1) + match = endtag.match(rawdata, i) if not match: return -1 - j = match.start(0) + j = match.end(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 076df37..28a21a4 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -286,6 +286,21 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('codepoint', 'convert', 42), ]) + def test_attr_values_quoted_markup(self): + """Multi-line and markup in attribute values""" + self.check_events("""<a title='foo\n<br>bar'>text</a>""", + [("starttag", "a", [("title", "foo\n<br>bar")]), + ("data", "text"), + ("endtag", "a")]) + self.check_events("""<a title='less < than'>text</a>""", + [("starttag", "a", [("title", "less < than")]), + ("data", "text"), + ("endtag", "a")]) + self.check_events("""<a title='greater > than'>text</a>""", + [("starttag", "a", [("title", "greater > than")]), + ("data", "text"), + ("endtag", "a")]) + def test_attr_funky_names(self): self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), |