diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-05-02 09:37:48 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-02 09:37:48 (GMT) |
commit | 18b07d773e09a2719e69aeaa925d5abb7ba0c068 (patch) | |
tree | 24e00867bcd614057ca886b2e977153d6168fa59 /Lib | |
parent | 614420df9796c8a4f01e24052fc0128b4c20c5bf (diff) | |
download | cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.zip cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.tar.gz cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.tar.bz2 |
bpo-36819: Fix crashes in built-in encoders with weird error handlers (GH-28593)
If the error handler returns position less or equal than the starting
position of non-encodable characters, most of built-in encoders didn't
properly re-size the output buffer. This led to out-of-bounds writes,
and segfaults.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_codeccallbacks.py | 177 |
1 files changed, 168 insertions, 9 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 243f002..4991330 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1,5 +1,6 @@ import codecs import html.entities +import itertools import sys import unicodedata import unittest @@ -22,6 +23,18 @@ class PosReturn: self.pos = len(exc.object) return ("<?>", oldpos) +class RepeatedPosReturn: + def __init__(self, repl="<?>"): + self.repl = repl + self.pos = 0 + self.count = 0 + + def handle(self, exc): + if self.count > 0: + self.count -= 1 + return (self.repl, self.pos) + return (self.repl, exc.end) + # A UnicodeEncodeError object with a bad start attribute class BadStartUnicodeEncodeError(UnicodeEncodeError): def __init__(self): @@ -783,20 +796,104 @@ class CodecCallbackTest(unittest.TestCase): codecs.lookup_error("namereplace") ) - def test_unencodablereplacement(self): + def test_encode_nonascii_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + for enc, input, repl in ( + ("ascii", "[¤]", "abc"), + ("iso-8859-1", "[€]", "½¾"), + ("iso-8859-15", "[¤]", "œŸ"), + ): + res = input.encode(enc, "test.replacing") + self.assertEqual(res, ("[" + repl + "]").encode(enc)) + + for enc, input, repl in ( + ("utf-8", "[\udc80]", "\U0001f40d"), + ("utf-16", "[\udc80]", "\U0001f40d"), + ("utf-32", "[\udc80]", "\U0001f40d"), + ): + with self.subTest(encoding=enc): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + + def test_encode_unencodable_replacement(self): def unencrepl(exc): if isinstance(exc, UnicodeEncodeError): - return ("\u4242", exc.end) + return (repl, exc.end) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.unencreplhandler", unencrepl) - for enc in ("ascii", "iso-8859-1", "iso-8859-15"): - self.assertRaises( - UnicodeEncodeError, - "\u4242".encode, - enc, - "test.unencreplhandler" - ) + + for enc, input, repl in ( + ("ascii", "[¤]", "½"), + ("iso-8859-1", "[€]", "œ"), + ("iso-8859-15", "[¤]", "½"), + ("utf-8", "[\udc80]", "\udcff"), + ("utf-16", "[\udc80]", "\udcff"), + ("utf-32", "[\udc80]", "\udcff"), + ): + with self.subTest(encoding=enc): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.unencreplhandler") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + + def test_encode_bytes_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + # It works even if the bytes sequence is not decodable. + for enc, input, repl in ( + ("ascii", "[¤]", b"\xbd\xbe"), + ("iso-8859-1", "[€]", b"\xbd\xbe"), + ("iso-8859-15", "[¤]", b"\xbd\xbe"), + ("utf-8", "[\udc80]", b"\xbd\xbe"), + ("utf-16le", "[\udc80]", b"\xbd\xbe"), + ("utf-16be", "[\udc80]", b"\xbd\xbe"), + ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), + ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), + ): + with self.subTest(encoding=enc): + res = input.encode(enc, "test.replacing") + self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) + + def test_encode_odd_bytes_replacement(self): + def handle(exc): + if isinstance(exc, UnicodeEncodeError): + return (repl, exc.end) + raise TypeError("don't know how to handle %r" % exc) + codecs.register_error("test.replacing", handle) + + input = "[\udc80]" + # Tests in which the replacement bytestring contains not whole number + # of code units. + for enc, repl in ( + *itertools.product(("utf-16le", "utf-16be"), + [b"a", b"abc"]), + *itertools.product(("utf-32le", "utf-32be"), + [b"a", b"ab", b"abc", b"abcde"]), + ): + with self.subTest(encoding=enc, repl=repl): + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.replacing") + exc = cm.exception + self.assertEqual(exc.start, 1) + self.assertEqual(exc.end, 2) + self.assertEqual(exc.object, input) + self.assertEqual(exc.reason, "surrogates not allowed") def test_badregistercall(self): # enhance coverage of: @@ -940,6 +1037,68 @@ class CodecCallbackTest(unittest.TestCase): self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) + def test_decodehelper_bug36819(self): + handler = RepeatedPosReturn("x") + codecs.register_error("test.bug36819", handler.handle) + + testcases = [ + ("ascii", b"\xff"), + ("utf-8", b"\xff"), + ("utf-16be", b'\xdc\x80'), + ("utf-32be", b'\x00\x00\xdc\x80'), + ("iso-8859-6", b"\xff"), + ] + for enc, bad in testcases: + input = "abcd".encode(enc) + bad + with self.subTest(encoding=enc): + handler.count = 50 + decoded = input.decode(enc, "test.bug36819") + self.assertEqual(decoded, 'abcdx' * 51) + + def test_encodehelper_bug36819(self): + handler = RepeatedPosReturn() + codecs.register_error("test.bug36819", handler.handle) + + input = "abcd\udc80" + encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in + encodings += ["iso-8859-15"] # charmap codec + if sys.platform == 'win32': + encodings = ["mbcs", "oem"] # code page codecs + + handler.repl = "\udcff" + for enc in encodings: + with self.subTest(encoding=enc): + handler.count = 50 + with self.assertRaises(UnicodeEncodeError) as cm: + input.encode(enc, "test.bug36819") + exc = cm.exception + self.assertEqual(exc.start, 4) + self.assertEqual(exc.end, 5) + self.assertEqual(exc.object, input) + if sys.platform == "win32": + handler.count = 50 + with self.assertRaises(UnicodeEncodeError) as cm: + codecs.code_page_encode(437, input, "test.bug36819") + exc = cm.exception + self.assertEqual(exc.start, 4) + self.assertEqual(exc.end, 5) + self.assertEqual(exc.object, input) + + handler.repl = "x" + for enc in encodings: + with self.subTest(encoding=enc): + # The interpreter should segfault after a handful of attempts. + # 50 was chosen to try to ensure a segfault without a fix, + # but not OOM a machine with one. + handler.count = 50 + encoded = input.encode(enc, "test.bug36819") + self.assertEqual(encoded.decode(enc), "abcdx" * 51) + if sys.platform == "win32": + handler.count = 50 + encoded = codecs.code_page_encode(437, input, "test.bug36819") + self.assertEqual(encoded[0].decode(), "abcdx" * 51) + self.assertEqual(encoded[1], len(input)) + def test_translatehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler() |