diff options
author | Georg Brandl <georg@python.org> | 2006-04-01 08:35:18 (GMT) |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2006-04-01 08:35:18 (GMT) |
commit | 7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7 (patch) | |
tree | 85426998f3d08e048d4b0211361b7140b04bc414 | |
parent | 48d5e508ebc136f8f67a2cb2bdd29e55324f5a95 (diff) | |
download | cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.zip cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.tar.gz cpython-7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7.tar.bz2 |
patch #1462498: handle entityrefs in attribute values.
-rw-r--r-- | Doc/lib/libsgmllib.tex | 7 | ||||
-rw-r--r-- | Lib/sgmllib.py | 34 | ||||
-rw-r--r-- | Lib/test/test_sgmllib.py | 14 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
4 files changed, 53 insertions, 5 deletions
diff --git a/Doc/lib/libsgmllib.tex b/Doc/lib/libsgmllib.tex index 27bf0b0..592c191 100644 --- a/Doc/lib/libsgmllib.tex +++ b/Doc/lib/libsgmllib.tex @@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which should be used to support semantic interpretation of the start tag. The \var{attributes} argument is a list of \code{(\var{name}, \var{value})} pairs containing the attributes found inside the tag's -\code{<>} brackets. The \var{name} has been translated to lower case -and double quotes and backslashes in the \var{value} have been interpreted. +\code{<>} brackets. The \var{name} has been translated to lower case. +Double quotes and backslashes in the \var{value} have been interpreted, +as well as known entity and character references. For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this method would be called as \samp{unknown_starttag('a', [('href', 'http://www.cwi.nl/')])}. The base implementation simply calls \var{method} with \var{attributes} as the only argument. 
+\versionadded[Handling of entity and character references within + attribute values]{2.5} \end{methoddesc} \begin{methoddesc}{handle_endtag}{tag, method} diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 08e365b..784dbe1 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase): attrname, rest, attrvalue = match.group(1, 2, 3) if not rest: attrvalue = attrname - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + l = 0 + new_attrvalue = '' + while l < len(attrvalue): + av_match = entityref.match(attrvalue, l) + if (av_match and av_match.group(1) in self.entitydefs and + attrvalue[av_match.end(1)] == ';'): + # only substitute entityrefs ending in ';' since + # otherwise we may break <a href='?p=x&q=y'> + # which is very common + new_attrvalue += self.entitydefs[av_match.group(1)] + l = av_match.end(0) + continue + ch_match = charref.match(attrvalue, l) + if ch_match: + try: + char = chr(int(ch_match.group(1))) + new_attrvalue += char + l = ch_match.end(0) + continue + except ValueError: + # invalid character reference, don't substitute + pass + # all other cases + new_attrvalue += attrvalue[l] + l += 1 + attrvalue = new_attrvalue attrs.append((attrname.lower(), attrvalue)) k = match.end(0) if rawdata[j] == '>': diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index bc25bd0..8e8b02f 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("starttag", "e", [("a", "rgb(1,2,3)")]), ]) + def test_attr_values_entities(self): + """Substitution of entities and charrefs in attribute values""" + # SF bug #1452246 + self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; ' + f="&xxx;" g='&#32;&#33;'
h='&#500;' i='x?a=b&c=d;'>""", + [("starttag", "a", [("b", "<"), + ("c", "<>"), + ("d", "&lt->"), + ("e", "< "), + ("f", "&xxx;"), + ("g", " !"), + ("h", "&#500;"), + ("i", "x?a=b&c=d;"), ])]) + def test_attr_funky_names(self): self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), @@ -489,6 +489,9 @@ Extension Modules Library ------- +- Patch #1462498: sgmllib now handles entity and character references + in attribute values. + - Added the sqlite3 package. This is based on pysqlite2.1.3, and provides a DB-API interface in the standard library. You'll need sqlite 3.2.2 or later to build this - if you have an earlier version, the C extension |