From 98d156b2b210059a9c1ed50ad7a1b09e6dae75ba Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 15 Mar 2015 23:41:37 +0200 Subject: Increased coverage of standard codec error handlers. --- Lib/test/test_codeccallbacks.py | 234 +++++++++++++++++++++++++++------------- 1 file changed, 158 insertions(+), 76 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 84804bb..54277de 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -6,14 +6,6 @@ import unicodedata import unittest import warnings -try: - import ctypes -except ImportError: - ctypes = None - SIZEOF_WCHAR_T = -1 -else: - SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) - class PosReturn: # this can be used for configurable callbacks @@ -212,14 +204,12 @@ class CodecCallbackTest(unittest.TestCase): b"\x00\x00\x00\x00\x00".decode, "unicode-internal", ) - if SIZEOF_WCHAR_T == 4: - def handler_unicodeinternal(exc): - if not isinstance(exc, UnicodeDecodeError): - raise TypeError("don't know how to handle %r" % exc) - return ("\x01", 1) - - with test.support.check_warnings(('unicode_internal codec has been ' - 'deprecated', DeprecationWarning)): + if len('\0'.encode('unicode-internal')) == 4: + def handler_unicodeinternal(exc): + if not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + return ("\x01", 1) + self.assertEqual( b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), "\u0000" @@ -364,12 +354,11 @@ class CodecCallbackTest(unittest.TestCase): ["ascii", "\uffffx", 0, 1, "ouch"], "'ascii' codec can't encode character '\\uffff' in position 0: ouch" ) - if SIZEOF_WCHAR_T == 4: - self.check_exceptionobjectargs( - UnicodeEncodeError, - ["ascii", "\U00010000x", 0, 1, "ouch"], - "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" - ) + self.check_exceptionobjectargs( + UnicodeEncodeError, + ["ascii", "\U00010000x", 0, 1, "ouch"], + "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" + ) def test_unicodedecodeerror(self): self.check_exceptionobjectargs( @@ -399,12 +388,11 @@ class CodecCallbackTest(unittest.TestCase): ["g\uffffrk", 1, 2, "ouch"], "can't translate character '\\uffff' in position 1: ouch" ) - if SIZEOF_WCHAR_T == 4: - self.check_exceptionobjectargs( - UnicodeTranslateError, - ["g\U00010000rk", 1, 2, "ouch"], - "can't translate character '\\U00010000' in position 1: ouch" - ) + self.check_exceptionobjectargs( + UnicodeTranslateError, + ["g\U00010000rk", 1, 2, "ouch"], + "can't translate character '\\U00010000' in position 1: ouch" + ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\xfcrk", 1, 3, "ouch"], @@ -431,6 +419,16 @@ class CodecCallbackTest(unittest.TestCase): codecs.strict_errors, UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch") ) + self.assertRaises( + UnicodeDecodeError, + codecs.strict_errors, + UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") + ) + self.assertRaises( + UnicodeTranslateError, + codecs.strict_errors, + UnicodeTranslateError("\u3042", 0, 1, "ouch") + ) def test_badandgoodignoreexceptions(self): # "ignore" complains about a non-exception passed in @@ -527,13 +525,15 @@ class CodecCallbackTest(unittest.TestCase): UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use the correct exception - cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042) + cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000, + 999999, 1000000) + cs += (0xd800, 0xdfff) s = "".join(chr(c) for c in cs) self.assertEqual( codecs.xmlcharrefreplace_errors( UnicodeEncodeError("ascii", s, 0, len(s), "ouch") ), - ("".join("&#%d;" % ord(c) for c in s), len(s)) + ("".join("&#%d;" % c for c in cs), len(s)) ) def test_badandgoodbackslashreplaceexceptions(self): @@ -561,55 +561,138 @@ class CodecCallbackTest(unittest.TestCase): UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use the correct exception - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), - ("\\u3042", 1) + tests = [ + ("\u3042", "\\u3042"), + ("\n", "\\x0a"), + ("a", "\\x61"), + ("\x00", "\\x00"), + ("\xff", "\\xff"), + ("\u0100", "\\u0100"), + ("\uffff", "\\uffff"), + ("\U00010000", "\\U00010000"), + ("\U0010ffff", "\\U0010ffff"), + # Lone surrogates + ("\ud800", "\\ud800"), + ("\udfff", "\\udfff"), + ("\ud800\udfff", "\\ud800\\udfff"), + ] + for s, r in tests: + with self.subTest(str=s): + self.assertEqual( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", s, 0, len(s), "ouch")), + (r, len(s)) + ) + + def test_badandgoodsurrogateescapeexceptions(self): + surrogateescape_errors = codecs.lookup_error('surrogateescape') + # "surrogateescape" complains about a non-exception passed in + self.assertRaises( + TypeError, + surrogateescape_errors, + 42 ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")), - ("\\x00", 1) + # "surrogateescape" complains about the wrong exception types + self.assertRaises( + TypeError, + surrogateescape_errors, + UnicodeError("ouch") ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")), - ("\\xff", 1) + # "surrogateescape" can not be used for translating + self.assertRaises( + TypeError, + surrogateescape_errors, + UnicodeTranslateError("\udc80", 0, 1, "ouch") ) + # Use the correct exception + for s in ("a", "\udc7f", "\udd00"): + with self.subTest(str=s): + self.assertRaises( + UnicodeEncodeError, + surrogateescape_errors, + UnicodeEncodeError("ascii", s, 0, 1, "ouch") + ) self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")), - ("\\u0100", 1) + surrogateescape_errors( + UnicodeEncodeError("ascii", "\udc80", 0, 1, "ouch")), + (b"\x80", 1) + ) + self.assertRaises( + UnicodeDecodeError, + surrogateescape_errors, + UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch") ) self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), - ("\\uffff", 1) - ) - if SIZEOF_WCHAR_T > 0: - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U00010000", - 0, 1, "ouch")), - ("\\U00010000", 1) - ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U0010ffff", - 0, 1, "ouch")), - ("\\U0010ffff", 1) - ) - # Lone surrogates (regardless of unicode width) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), - ("\\ud800", 1) - ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), - ("\\udfff", 1) - ) + surrogateescape_errors( + UnicodeDecodeError("ascii", bytearray(b"\x80"), 0, 1, "ouch")), + ("\udc80", 1) + ) + + def test_badandgoodsurrogatepassexceptions(self): + surrogatepass_errors = codecs.lookup_error('surrogatepass') + # "surrogatepass" complains about a non-exception passed in + self.assertRaises( + TypeError, + surrogatepass_errors, + 42 + ) + # "surrogatepass" complains about the wrong exception types + self.assertRaises( + TypeError, + surrogatepass_errors, + UnicodeError("ouch") + ) + # "surrogatepass" can not be used for translating + self.assertRaises( + TypeError, + surrogatepass_errors, + UnicodeTranslateError("\ud800", 0, 1, "ouch") + ) + # Use the correct exception + for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): + with self.subTest(encoding=enc): + self.assertRaises( + UnicodeEncodeError, + surrogatepass_errors, + UnicodeEncodeError(enc, "a", 0, 1, "ouch") + ) + self.assertRaises( + UnicodeDecodeError, + surrogatepass_errors, + UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") + ) + tests = [ + ("ascii", "\ud800", b'\xed\xa0\x80', 3), + ("utf-8", "\ud800", b'\xed\xa0\x80', 3), + ("utf-16le", "\ud800", b'\x00\xd8', 2), + ("utf-16be", "\ud800", b'\xd8\x00', 2), + ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4), + ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4), + ("ascii", "\udfff", b'\xed\xbf\xbf', 3), + ("utf-8", "\udfff", b'\xed\xbf\xbf', 3), + ("utf-16le", "\udfff", b'\xff\xdf', 2), + ("utf-16be", "\udfff", b'\xdf\xff', 2), + ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4), + ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4), + ("ascii", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), + ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), + ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2), + ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2), + ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4), + ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4), + ] + for enc, s, b, n in tests: + with self.subTest(encoding=enc, str=s, bytes=b): + self.assertEqual( + surrogatepass_errors( + UnicodeEncodeError(enc, s, 0, len(s), "ouch")), + (b, len(s)) + ) + self.assertEqual( + surrogatepass_errors( + UnicodeDecodeError(enc, bytearray(b[:n]), 0, n, "ouch")), + (s[:1], n) + ) def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) @@ -688,9 +771,8 @@ class CodecCallbackTest(unittest.TestCase): # enhance coverage of: # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() # and inline implementations - v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000) - if SIZEOF_WCHAR_T == 4: - v += (100000, 500000, 1000000) + v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, + 500000, 1000000) s = "".join([chr(x) for x in v]) codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) for enc in ("ascii", "iso-8859-15"): -- cgit v0.12