#13633: Added a new convert_charrefs keyword arg to HTMLParser that, when True, automatically converts all character references.

author: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-23 17:52:05 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2013-11-23 17:52:05 (GMT)
commit: 95401c5f6b9f07b094924559177c9b30a1c38998 (patch)
tree: 3029ea3bbffc0c53c64275a2e587bbf696a740cb /Lib
parent: e7f87e12626d6ae3b9ed8cae8904a6afad580ffc (diff)
download: cpython-95401c5f6b9f07b094924559177c9b30a1c38998.zip
cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.gz
cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.bz2
2 files changed, 107 insertions, 25 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index a228e8e..12c28b8 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -97,7 +97,7 @@ class HTMLParseError(Exception):
         return result
 
 
-_strict_sentinel = object()
+_default_sentinel = object()
 
 class HTMLParser(_markupbase.ParserBase):
     """Find tags and other markup and call handler functions.
@@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase):
     self.handle_startendtag(); end tags by self.handle_endtag().  The
     data between tags is passed from the parser to the derived class
     by calling self.handle_data() with the data as argument (the data
-    may be split up in arbitrary chunks).  Entity references are
-    passed by calling self.handle_entityref() with the entity
-    reference as the argument.  Numeric character references are
-    passed to self.handle_charref() with the string containing the
-    reference as the argument.
+    may be split up in arbitrary chunks).  If convert_charrefs is
+    True, character references are converted automatically to the
+    corresponding Unicode characters (and the text passed to
+    self.handle_data() is no longer split in chunks); otherwise they
+    are passed by calling self.handle_entityref() or
+    self.handle_charref() with the string containing, respectively,
+    the named or numeric reference as the argument.
     """
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-    def __init__(self, strict=_strict_sentinel):
+    def __init__(self, strict=_default_sentinel, *,
+                 convert_charrefs=_default_sentinel):
         """Initialize and reset this instance.
 
+        If convert_charrefs is True (default: False), all character references
+        are automatically converted to the corresponding Unicode characters.
         If strict is set to False (the default) the parser will parse invalid
         markup, otherwise it will raise an error.  Note that the strict mode
         and argument are deprecated.
         """
-        if strict is not _strict_sentinel:
+        if strict is not _default_sentinel:
             warnings.warn("The strict argument and mode are deprecated.",
                           DeprecationWarning, stacklevel=2)
         else:
             strict = False  # default
         self.strict = strict
+        if convert_charrefs is _default_sentinel:
+            convert_charrefs = False  # default
+            warnings.warn("The value of convert_charrefs will become True in "
+                          "3.5. You are encouraged to set the value explicitly.",
+                          DeprecationWarning, stacklevel=2)
+        self.convert_charrefs = convert_charrefs
         self.reset()
 
     def reset(self):
@@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase):
         i = 0
         n = len(rawdata)
         while i < n:
-            match = self.interesting.search(rawdata, i) # < or &
-            if match:
-                j = match.start()
+            if self.convert_charrefs and not self.cdata_elem:
+                j = rawdata.find('<', i)
+                if j < 0:
+                    if not end:
+                        break  # wait till we get all the text
+                    j = n
             else:
-                if self.cdata_elem:
-                    break
-                j = n
-            if i < j: self.handle_data(rawdata[i:j])
+                match = self.interesting.search(rawdata, i)  # < or &
+                if match:
+                    j = match.start()
+                else:
+                    if self.cdata_elem:
+                        break
+                    j = n
+            if i < j:
+                if self.convert_charrefs and not self.cdata_elem:
+                    self.handle_data(unescape(rawdata[i:j]))
+                else:
+                    self.handle_data(rawdata[i:j])
             i = self.updatepos(i, j)
             if i == n: break
             startswith = rawdata.startswith
@@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase):
                             k = i + 1
                     else:
                         k += 1
-                    self.handle_data(rawdata[i:k])
+                    if self.convert_charrefs and not self.cdata_elem:
+                        self.handle_data(unescape(rawdata[i:k]))
+                    else:
+                        self.handle_data(rawdata[i:k])
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n and not self.cdata_elem:
-            self.handle_data(rawdata[i:n])
+            if self.convert_charrefs and not self.cdata_elem:
+                self.handle_data(unescape(rawdata[i:n]))
+            else:
+                self.handle_data(rawdata[i:n])
             i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 509b3cd..1a480c8 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector):
         self.append(("starttag_text", self.get_starttag_text()))
 
 
+class EventCollectorCharrefs(EventCollector):
+
+    def get_events(self):
+        return self.events
+
+    def handle_charref(self, data):
+        self.fail('This should never be called with convert_charrefs=True')
+
+    def handle_entityref(self, data):
+        self.fail('This should never be called with convert_charrefs=True')
+
+
 class TestCaseBase(unittest.TestCase):
 
     def get_collector(self):
@@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase):
         parser.close()
         events = parser.get_events()
         if events != expected_events:
-            self.fail("received events did not match expected events\n"
-                      "Expected:\n" + pprint.pformat(expected_events) +
+            self.fail("received events did not match expected events" +
+                      "\nSource:\n" + repr(source) +
+                      "\nExpected:\n" + pprint.pformat(expected_events) +
                       "\nReceived:\n" + pprint.pformat(events))
 
     def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra())
+        self._run_check(source, events,
+                        EventCollectorExtra(convert_charrefs=False))
 
     def _parse_error(self, source):
         def parse(source=source):
@@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase):
 
     def get_collector(self):
         with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+            return EventCollector(strict=True, convert_charrefs=False)
 
     def test_processing_instruction_only(self):
         self._run_check("<?processing instruction>", [
@@ -335,7 +349,7 @@ text
             self._run_check(s, [("starttag", element_lower, []),
                                 ("data", content),
                                 ("endtag", element_lower)],
-                            collector=Collector())
+                            collector=Collector(convert_charrefs=False))
 
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
@@ -363,14 +377,54 @@ text
                     ('comment', '[if lte IE 7]>pretty?<![endif]')]
         self._run_check(html, expected)
 
+    def test_convert_charrefs(self):
+        collector = lambda: EventCollectorCharrefs(convert_charrefs=True)
+        self.assertTrue(collector().convert_charrefs)
+        charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
+        # check charrefs in the middle of the text/attributes
+        expected = [('starttag', 'a', [('href', 'foo"zar')]),
+                    ('data', 'a"z'), ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+                            expected, collector=collector())
+        # check charrefs at the beginning/end of the text/attributes
+        expected = [('data', '"'),
+                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+                    ('data', '"'), ('endtag', 'a'), ('data', '"')]
+        for charref in charrefs:
+            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+                            '{0}</a>{0}'.format(charref),
+                            expected, collector=collector())
+        # check charrefs in <script>/<style> elements
+        for charref in charrefs:
+            text = 'X'.join([charref]*3)
+            expected = [('data', '"'),
+                        ('starttag', 'script', []), ('data', text),
+                        ('endtag', 'script'), ('data', '"'),
+                        ('starttag', 'style', []), ('data', text),
+                        ('endtag', 'style'), ('data', '"')]
+            self._run_check('{1}<script>{0}</script>{1}'
+                            '<style>{0}</style>{1}'.format(text, charref),
+                            expected, collector=collector())
+        # check truncated charrefs at the end of the file
+        html = '&quo &# &#x'
+        for x in range(1, len(html)):
+            self._run_check(html[:x], [('data', html[:x])],
+                            collector=collector())
+        # check a string with no charrefs
+        self._run_check('no charrefs here', [('data', 'no charrefs here')],
+                        collector=collector())
+
 
 class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
 
     def get_collector(self):
-        return EventCollector()
+        return EventCollector(convert_charrefs=False)
 
     def test_deprecation_warnings(self):
         with self.assertWarns(DeprecationWarning):
+            EventCollector()  # convert_charrefs not passed explicitly
+        with self.assertWarns(DeprecationWarning):
             EventCollector(strict=True)
         with self.assertWarns(DeprecationWarning):
             EventCollector(strict=False)
@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase):
 
     def get_collector(self):
         with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+            return EventCollector(strict=True, convert_charrefs=False)
 
     def test_attr_syntax(self):
         output = [
@@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase):
 class AttributesTolerantTestCase(AttributesStrictTestCase):
 
     def get_collector(self):
-        return EventCollector()
+        return EventCollector(convert_charrefs=False)
 
     def test_attr_funky_names2(self):
         self._run_check(
author	Ezio Melotti <ezio.melotti@gmail.com>	2013-11-23 17:52:05 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2013-11-23 17:52:05 (GMT)
commit	95401c5f6b9f07b094924559177c9b30a1c38998 (patch)
tree	3029ea3bbffc0c53c64275a2e587bbf696a740cb /Lib
parent	e7f87e12626d6ae3b9ed8cae8904a6afad580ffc (diff)
download	cpython-95401c5f6b9f07b094924559177c9b30a1c38998.zip cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.gz cpython-95401c5f6b9f07b094924559177c9b30a1c38998.tar.bz2