From ab8a6bba250b35ea87d8976e9cd4dd74e57cfd0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Tue, 6 Mar 2007 14:43:00 +0000 Subject: Patch #912410: Replace HTML entity references for attribute values in HTMLParser. --- Doc/lib/libhtmlparser.tex | 18 +++++++++++------- Lib/HTMLParser.py | 30 ++++++++++++++++++++++++------ Lib/test/test_htmlparser.py | 5 +++++ Misc/NEWS | 3 +++ 4 files changed, 43 insertions(+), 13 deletions(-) diff --git a/Doc/lib/libhtmlparser.tex b/Doc/lib/libhtmlparser.tex index 52f8409..5e99f27 100644 --- a/Doc/lib/libhtmlparser.tex +++ b/Doc/lib/libhtmlparser.tex @@ -75,14 +75,18 @@ This method is called to handle the start of a tag. It is intended to be overridden by a derived class; the base class implementation does nothing. -The \var{tag} argument is the name of the tag converted to -lower case. The \var{attrs} argument is a list of \code{(\var{name}, -\var{value})} pairs containing the attributes found inside the tag's -\code{<>} brackets. The \var{name} will be translated to lower case -and double quotes and backslashes in the \var{value} have been -interpreted. For instance, for the tag \code{}, this method would be called as +The \var{tag} argument is the name of the tag converted to lower case. +The \var{attrs} argument is a list of \code{(\var{name}, \var{value})} +pairs containing the attributes found inside the tag's \code{<>} +brackets. The \var{name} will be translated to lower case, and quotes +in the \var{value} have been removed, and character and entity +references have been replaced. For instance, for the tag \code{}, this method would be called as \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. + +\versionchanged[All entity references from htmlentitydefs are now +replaced in the attribute values]{2.6} + \end{methoddesc} \begin{methoddesc}{handle_startendtag}{tag, attrs} diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 8380466..2cbc2ec 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase): self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting + entitydefs = None def unescape(self, s): if '&' not in s: return s - s = s.replace("<", "<") - s = s.replace(">", ">") - s = s.replace("'", "'") - s = s.replace(""", '"') - s = s.replace("&", "&") # Must be last - return s + def replaceEntities(s): + s = s.groups()[0] + if s[0] == "#": + s = s[1:] + if s[0] in ['x','X']: + c = int(s[1:], 16) + else: + c = int(s) + return unichr(c) + else: + # Cannot use name2codepoint directly, because HTMLParser supports apos, + # which is not part of HTML 4 + import htmlentitydefs + if HTMLParser.entitydefs is None: + entitydefs = HTMLParser.entitydefs = {'apos':u"'"} + for k, v in htmlentitydefs.name2codepoint.iteritems(): + entitydefs[k] = unichr(v) + try: + return self.entitydefs[s] + except KeyError: + return '&'+s+';' + + return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 54b90cd..229bbed 100755 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -309,6 +309,11 @@ DOCTYPE html [ ("endtag", "script"), ]) + def test_entityrefs_in_attributes(self): + self._run_check("", [ + ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) + ]) + def test_main(): test_support.run_unittest(HTMLParserTestCase) diff --git a/Misc/NEWS b/Misc/NEWS index 5ba2522..9f3ba9d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -141,6 +141,9 @@ Core and builtins Library ------- +- Patch #912410: Replace HTML entity references for attribute values + in HTMLParser. + - Patch #1663234: you can now run doctest on test files and modules using "python -m doctest [-v] filename ...". -- cgit v0.12