diff options
author | Fred Drake <fdrake@acm.org> | 2006-06-23 06:03:45 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2006-06-23 06:03:45 (GMT) |
commit | 2f99da636b3c809977405ee6220ad7ea822d7dd3 (patch) | |
tree | d7851b84f47360d17ee4882b9b88e33a98a85669 | |
parent | b114984225c0371a21ce44e037abb452e36e2a6d (diff) | |
download | cpython-2f99da636b3c809977405ee6220ad7ea822d7dd3.zip cpython-2f99da636b3c809977405ee6220ad7ea822d7dd3.tar.gz cpython-2f99da636b3c809977405ee6220ad7ea822d7dd3.tar.bz2 |
- SF bug #853506: IP6 address parsing in sgmllib
('[' and ']' were not accepted in unquoted attribute values)
- cleaned up tests of character and entity reference decoding so the
tests cover the documented relationships among handle_charref,
handle_entityref, convert_charref, convert_codepoint, and
convert_entityref, without bringing up Unicode issues that sgmllib
cannot be involved in
-rw-r--r-- | Lib/sgmllib.py | 6 | ||||
-rw-r--r-- | Lib/test/test_sgmllib.py | 50 |
2 files changed, 45 insertions, 11 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 194396b..3ab57c2 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -33,7 +33,7 @@ endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') class SGMLParseError(RuntimeError): @@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase): def handle_charref(self, name): """Handle character reference, no need to override.""" - replacement = convert_charref(name) + replacement = self.convert_charref(name) if replacement is None: self.unknown_charref(name) else: - self.handle_data(convert_charref(name)) + self.handle_data(replacement) # Definition of entities -- derived classes may override entitydefs = \ diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 31b54de..076df37 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -1,4 +1,6 @@ +import htmlentitydefs import pprint +import re import sgmllib import unittest from test import test_support @@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector): class HTMLEntityCollector(EventCollector): - import re, htmlentitydefs + entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)' '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)') def convert_charref(self, name): self.append(("charref", "convert", name)) - if name.startswith('x'): - return unichr(int(name[1:],16)) - else: - return unichr(int(name)) + if name[0] != "x": + return EventCollector.convert_charref(self, name) + + def convert_codepoint(self, codepoint): + self.append(("codepoint", "convert", codepoint)) + EventCollector.convert_codepoint(self, codepoint) def convert_entityref(self, name): self.append(("entityref", "convert", name)) - return unichr(self.htmlentitydefs.name2codepoint[name]) + return EventCollector.convert_entityref(self, name) + + # These to 
record that they were called, then pass the call along + # to the default implementation so that it's actions can be + # recorded. + + def handle_charref(self, data): + self.append(("charref", data)) + sgmllib.SGMLParser.handle_charref(self, data) + + def handle_entityref(self, data): + self.append(("entityref", data)) + sgmllib.SGMLParser.handle_entityref(self, data) class SGMLParserTestCase(unittest.TestCase): @@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ])]) def test_convert_overrides(self): + # This checks that the character and entity reference + # conversion helpers are called at the documented times. No + # attempt is made to really change what the parser accepts. + # self.collector = HTMLEntityCollector - self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [ + self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>' + '&foobar;&#42;'), [ ('entityref', 'convert', 'ldquo'), ('charref', 'convert', 'x201d'), - ('starttag', 'a', [('title', u'\u201ctest\u201d')]), + ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]), ('data', 'foo'), ('endtag', 'a'), + ('entityref', 'foobar'), + ('entityref', 'convert', 'foobar'), + ('charref', '42'), + ('charref', 'convert', '42'), + ('codepoint', 'convert', 42), ]) def test_attr_funky_names(self): @@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ]) + def test_attr_value_ip6_url(self): + # http://www.python.org/sf/853506 + self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>" + "<a href=http://[1080::8:800:200C:417A]/>"), [ + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ]) + def test_illegal_declarations(self): s = 'abc<!spacer type="block" height="25">def' self.check_events(s, [ |