1 files changed, 88 insertions, 19 deletions
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 11d9c9c..144f820 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -70,6 +70,15 @@ class EventCollectorExtra(EventCollector):
         self.append(("starttag_text", self.get_starttag_text()))
 
 
+class EventCollectorCharrefs(EventCollector):
+    """Collector that raises if a charref/entityref handler is reached."""
+    def handle_charref(self, data):
+        raise AssertionError(
+            'This should never be called with convert_charrefs=True')
+
+    handle_entityref = handle_charref
+
+
 class TestCaseBase(unittest.TestCase):
 
     def get_collector(self):
@@ -84,26 +93,30 @@ class TestCaseBase(unittest.TestCase):
         parser.close()
         events = parser.get_events()
         if events != expected_events:
-            self.fail("received events did not match expected events\n"
-                      "Expected:\n" + pprint.pformat(expected_events) +
+            self.fail("received events did not match expected events" +
+                      "\nSource:\n" + repr(source) +
+                      "\nExpected:\n" + pprint.pformat(expected_events) +
                       "\nReceived:\n" + pprint.pformat(events))
 
     def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra())
+        collector = EventCollectorExtra(convert_charrefs=False)
+        self._run_check(source, events, collector)
 
     def _parse_error(self, source):
         def parse(source=source):
             parser = self.get_collector()
             parser.feed(source)
             parser.close()
-        self.assertRaises(html.parser.HTMLParseError, parse)
+        with self.assertRaises(html.parser.HTMLParseError), \
+                self.assertWarns(DeprecationWarning):
+            parse()
 
 
 class HTMLParserStrictTestCase(TestCaseBase):
 
     def get_collector(self):
-        with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True, convert_charrefs=False)
 
     def test_processing_instruction_only(self):
         self._run_check("<?processing instruction>", [
@@ -339,7 +352,7 @@ text
             self._run_check(s, [("starttag", element_lower, []),
                                 ("data", content),
                                 ("endtag", element_lower)],
-                            collector=Collector())
+                            collector=Collector(convert_charrefs=False))
 
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
@@ -367,11 +380,60 @@ text
                     ('comment', '[if lte IE 7]>pretty?<![endif]')]
         self._run_check(html, expected)
 
+    def test_convert_charrefs(self):
+        def collector():
+            return EventCollectorCharrefs(convert_charrefs=True)
+        self.assertTrue(collector().convert_charrefs)
+        charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
+        # check charrefs in the middle of the text/attributes
+        expected = [('starttag', 'a', [('href', 'foo"zar')]),
+                    ('data', 'a"z'), ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+                            expected, collector=collector())
+        # check charrefs at the beginning/end of the text/attributes
+        expected = [('data', '"'),
+                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+                    ('data', '"'), ('endtag', 'a'), ('data', '"')]
+        for charref in charrefs:
+            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+                            '{0}</a>{0}'.format(charref),
+                            expected, collector=collector())
+        # check charrefs in <script>/<style> elements
+        for charref in charrefs:
+            text = 'X'.join([charref]*3)
+            expected = [('data', '"'),
+                        ('starttag', 'script', []), ('data', text),
+                        ('endtag', 'script'), ('data', '"'),
+                        ('starttag', 'style', []), ('data', text),
+                        ('endtag', 'style'), ('data', '"')]
+            self._run_check('{1}<script>{0}</script>{1}'
+                            '<style>{0}</style>{1}'.format(text, charref),
+                            expected, collector=collector())
+        # check truncated charrefs at the end of the file
+        s = '&quo &# &#x'
+        for n in range(1, len(s) + 1):  # + 1: also test the whole string
+            self._run_check(s[:n], [('data', s[:n])], collector=collector())
+        # check a string with no charrefs
+        self._run_check('no charrefs here', [('data', 'no charrefs here')],
+                        collector=collector())
+
 
 class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
 
     def get_collector(self):
-        return EventCollector(strict=False)
+        return EventCollector(convert_charrefs=False)  # strict= would warn
+
+    def test_deprecation_warnings(self):
+        # omitting convert_charrefs, or passing strict at all, must warn
+        for kwargs in ({}, {'strict': True}, {'strict': False}):
+            with self.assertWarns(DeprecationWarning):
+                EventCollector(**kwargs)
+        # assertWarns goes outside: an exception escaping it skips its check
+        parser = EventCollector(convert_charrefs=False)
+        with self.assertWarns(DeprecationWarning):
+            with self.assertRaises(html.parser.HTMLParseError):
+                parser.error('test')
 
     def test_tolerant_parsing(self):
         self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
@@ -564,17 +626,12 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         for html, expected in data:
             self._run_check(html, expected)
 
-    def test_unescape_function(self):
+    def test_unescape_method(self):
+        from html import unescape
         p = self.get_collector()
-        self.assertEqual(p.unescape('&#bad;'),'&#bad;')
-        self.assertEqual(p.unescape('&#0038;'),'&')
-        # see #12888
-        self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
-        # see #15156
-        self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
-                                    '&alphacentauri&alpha;centauri'),
-                                    'ÉricÉric&alphacentauriαcentauri')
-        self.assertEqual(p.unescape('&co;'), '&co;')
+        s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
+        with self.assertWarns(DeprecationWarning):
+            self.assertEqual(p.unescape(s), unescape(s))
 
     def test_broken_comments(self):
         html = ('<! not really a comment >'
@@ -625,12 +682,24 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
         ]
         self._run_check(html, expected)
 
+    def test_convert_charrefs_dropped_text(self):
+        # #23144: with convert_charrefs=True all the events must be
+        # triggered by feed() alone, even if .close() is never called
+        expected = [
+            ('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+            ('endtag', 'a'),
+            ('data', ' bar & baz'),  # this one was missing before the fix
+        ]
+        collector = EventCollector(convert_charrefs=True)
+        collector.feed("foo <a>link</a> bar &amp; baz")
+        self.assertEqual(collector.get_events(), expected)
+
 
 class AttributesStrictTestCase(TestCaseBase):
 
     def get_collector(self):
-        with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True, convert_charrefs=False)
 
     def test_attr_syntax(self):
         output = [
@@ -691,7 +760,7 @@ class AttributesStrictTestCase(TestCaseBase):
 class AttributesTolerantTestCase(AttributesStrictTestCase):
 
     def get_collector(self):
-        return EventCollector(strict=False)
+        return EventCollector(convert_charrefs=False)  # strict= would warn
 
     def test_attr_funky_names2(self):
         self._run_check(