diff options
author | Georg Brandl <georg@python.org> | 2006-04-01 08:35:18 (GMT) |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2006-04-01 08:35:18 (GMT) |
commit | 7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7 (patch) | |
tree | 85426998f3d08e048d4b0211361b7140b04bc414 /Lib | |
parent | 48d5e508ebc136f8f67a2cb2bdd29e55324f5a95 (diff) | |
download | cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.zip cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.tar.gz cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.tar.bz2 |
patch #1462498: handle entityrefs in attribute values.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sgmllib.py | 34 | ||||
-rw-r--r-- | Lib/test/test_sgmllib.py | 14 |
2 files changed, 45 insertions, 3 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 08e365b..784dbe1 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase): attrname, rest, attrvalue = match.group(1, 2, 3) if not rest: attrvalue = attrname - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + l = 0 + new_attrvalue = '' + while l < len(attrvalue): + av_match = entityref.match(attrvalue, l) + if (av_match and av_match.group(1) in self.entitydefs and + attrvalue[av_match.end(1)] == ';'): + # only substitute entityrefs ending in ';' since + # otherwise we may break <a href='?p=x&q=y'> + # which is very common + new_attrvalue += self.entitydefs[av_match.group(1)] + l = av_match.end(0) + continue + ch_match = charref.match(attrvalue, l) + if ch_match: + try: + char = chr(int(ch_match.group(1))) + new_attrvalue += char + l = ch_match.end(0) + continue + except ValueError: + # invalid character reference, don't substitute + pass + # all other cases + new_attrvalue += attrvalue[l] + l += 1 + attrvalue = new_attrvalue attrs.append((attrname.lower(), attrvalue)) k = match.end(0) if rawdata[j] == '>': diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index bc25bd0..8e8b02f 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("starttag", "e", [("a", "rgb(1,2,3)")]), ]) + def test_attr_values_entities(self): + """Substitution of entities and charrefs in attribute values""" + # SF bug #1452246 + self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt;-&gt; e='&lt; ' + f="&xxx;" g='&#32;&#33;'
h='&#500;' i='x?a=b&c=d;'>""", + [("starttag", "a", [("b", "<"), + ("c", "<>"), + ("d", "<->"), + ("e", "< "), + ("f", "&xxx;"), + ("g", " !"), + ("h", "&#500;"), + ("i", "x?a=b&c=d;"), ])]) + def test_attr_funky_names(self): self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), |