diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2007-03-06 14:43:00 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2007-03-06 14:43:00 (GMT) |
commit | ab8a6bba250b35ea87d8976e9cd4dd74e57cfd0a (patch) | |
tree | 6e581be3225031d7cc1e1267881a64cca00243a7 /Lib | |
parent | ff432e6f4ad8e4430ce984ec883a3d038e1c7ab9 (diff) | |
download | cpython-ab8a6bba250b35ea87d8976e9cd4dd74e57cfd0a.zip cpython-ab8a6bba250b35ea87d8976e9cd4dd74e57cfd0a.tar.gz cpython-ab8a6bba250b35ea87d8976e9cd4dd74e57cfd0a.tar.bz2 |
Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/HTMLParser.py | 30 | ||||
-rwxr-xr-x | Lib/test/test_htmlparser.py | 5 |
2 files changed, 29 insertions, 6 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 8380466..2cbc2ec 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase): self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting + entitydefs = None def unescape(self, s): if '&' not in s: return s - s = s.replace("&lt;", "<") - s = s.replace("&gt;", ">") - s = s.replace("&apos;", "'") - s = s.replace("&quot;", '"') - s = s.replace("&amp;", "&") # Must be last - return s + def replaceEntities(s): + s = s.groups()[0] + if s[0] == "#": + s = s[1:] + if s[0] in ['x','X']: + c = int(s[1:], 16) + else: + c = int(s) + return unichr(c) + else: + # Cannot use name2codepoint directly, because HTMLParser supports apos, + # which is not part of HTML 4 + import htmlentitydefs + if HTMLParser.entitydefs is None: + entitydefs = HTMLParser.entitydefs = {'apos':u"'"} + for k, v in htmlentitydefs.name2codepoint.iteritems(): + entitydefs[k] = unichr(v) + try: + return self.entitydefs[s] + except KeyError: + return '&'+s+';' + + return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 54b90cd..229bbed 100755 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -309,6 +309,11 @@ DOCTYPE html [ ("endtag", "script"), ]) + def test_entityrefs_in_attributes(self): + self._run_check("<html foo='&euro;&amp;aa&amp;unsupported;'>", [ + ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) + ]) + def test_main(): test_support.run_unittest(HTMLParserTestCase) |