diff options
Diffstat (limited to 'Lib/test/test_unicode.py')
| -rw-r--r-- | Lib/test/test_unicode.py | 351 |
1 files changed, 276 insertions, 75 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 4793707..65b26c5 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -11,6 +11,7 @@ import sys import unittest import warnings from test import support, string_tests +import _string # Error handling (bad decoder return) def search_function(encoding): @@ -30,18 +31,11 @@ def search_function(encoding): return None codecs.register(search_function) -class UnicodeTest( - string_tests.CommonTest, - string_tests.MixinStrUnicodeUserStringTest, - string_tests.MixinStrUnicodeTest, - ): - type2test = str - - def setUp(self): - self.warning_filters = warnings.filters[:] +class UnicodeTest(string_tests.CommonTest, + string_tests.MixinStrUnicodeUserStringTest, + string_tests.MixinStrUnicodeTest): - def tearDown(self): - warnings.filters = self.warning_filters + type2test = str def checkequalnofix(self, result, object, methodname, *args): method = getattr(object, methodname) @@ -282,23 +276,18 @@ class UnicodeTest( self.assertRaises(TypeError, 'replace'.replace, "r", 42) def test_bytes_comparison(self): - warnings.simplefilter('ignore', BytesWarning) - self.assertEqual('abc' == b'abc', False) - self.assertEqual('abc' != b'abc', True) - self.assertEqual('abc' == bytearray(b'abc'), False) - self.assertEqual('abc' != bytearray(b'abc'), True) + with support.check_warnings(): + warnings.simplefilter('ignore', BytesWarning) + self.assertEqual('abc' == b'abc', False) + self.assertEqual('abc' != b'abc', True) + self.assertEqual('abc' == bytearray(b'abc'), False) + self.assertEqual('abc' != bytearray(b'abc'), True) def test_comparison(self): # Comparisons: self.assertEqual('abc', 'abc') - self.assertEqual('abc', 'abc') - self.assertEqual('abc', 'abc') - self.assertTrue('abcd' > 'abc') - self.assertTrue('abcd' > 'abc') self.assertTrue('abcd' > 'abc') self.assertTrue('abc' < 'abcd') - self.assertTrue('abc' < 'abcd') - self.assertTrue('abc' < 'abcd') if 0: # Move these tests to a Unicode collation module test... @@ -435,32 +424,32 @@ class UnicodeTest( def test_contains(self): # Testing Unicode contains method - self.assertTrue('a' in 'abdb') - self.assertTrue('a' in 'bdab') - self.assertTrue('a' in 'bdaba') - self.assertTrue('a' in 'bdba') - self.assertTrue('a' not in 'bdb') - self.assertTrue('a' in 'bdba') - self.assertTrue('a' in ('a',1,None)) - self.assertTrue('a' in (1,None,'a')) - self.assertTrue('a' in ('a',1,None)) - self.assertTrue('a' in (1,None,'a')) - self.assertTrue('a' not in ('x',1,'y')) - self.assertTrue('a' not in ('x',1,None)) - self.assertTrue('abcd' not in 'abcxxxx') - self.assertTrue('ab' in 'abcd') - self.assertTrue('ab' in 'abc') - self.assertTrue('ab' in (1,None,'ab')) - self.assertTrue('' in 'abc') - self.assertTrue('' in '') - self.assertTrue('' in 'abc') - self.assertTrue('\0' not in 'abc') - self.assertTrue('\0' in '\0abc') - self.assertTrue('\0' in 'abc\0') - self.assertTrue('a' in '\0abc') - self.assertTrue('asdf' in 'asdf') - self.assertTrue('asdf' not in 'asd') - self.assertTrue('asdf' not in '') + self.assertIn('a', 'abdb') + self.assertIn('a', 'bdab') + self.assertIn('a', 'bdaba') + self.assertIn('a', 'bdba') + self.assertNotIn('a', 'bdb') + self.assertIn('a', 'bdba') + self.assertIn('a', ('a',1,None)) + self.assertIn('a', (1,None,'a')) + self.assertIn('a', ('a',1,None)) + self.assertIn('a', (1,None,'a')) + self.assertNotIn('a', ('x',1,'y')) + self.assertNotIn('a', ('x',1,None)) + self.assertNotIn('abcd', 'abcxxxx') + self.assertIn('ab', 'abcd') + self.assertIn('ab', 'abc') + self.assertIn('ab', (1,None,'ab')) + self.assertIn('', 'abc') + self.assertIn('', '') + self.assertIn('', 'abc') + self.assertNotIn('\0', 'abc') + self.assertIn('\0', '\0abc') + self.assertIn('\0', 'abc\0') + self.assertIn('a', '\0abc') + self.assertIn('asdf', 'asdf') + self.assertNotIn('asdf', 'asd') + self.assertNotIn('asdf', '') self.assertRaises(TypeError, "abc".__contains__) @@ -620,13 +609,16 @@ class UnicodeTest( self.assertEqual('{0}'.format({}), '{}') self.assertEqual('{0}'.format([]), '[]') self.assertEqual('{0}'.format([1]), '[1]') - self.assertEqual('{0}'.format(E('data')), 'E(data)') - self.assertEqual('{0:^10}'.format(E('data')), ' E(data) ') - self.assertEqual('{0:^10s}'.format(E('data')), ' E(data) ') + self.assertEqual('{0:d}'.format(G('data')), 'G(data)') - self.assertEqual('{0:>15s}'.format(G('data')), ' string is data') self.assertEqual('{0!s}'.format(G('data')), 'string is data') + msg = 'object.__format__ with a non-empty format string is deprecated' + with support.check_warnings((msg, PendingDeprecationWarning)): + self.assertEqual('{0:^10}'.format(E('data')), ' E(data) ') + self.assertEqual('{0:^10s}'.format(E('data')), ' E(data) ') + self.assertEqual('{0:>15s}'.format(G('data')), ' string is data') + self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007, month=8, day=27)), @@ -657,8 +649,6 @@ class UnicodeTest( self.assertRaises(IndexError, '{1}'.format, 'abc') self.assertRaises(KeyError, '{x}'.format) self.assertRaises(ValueError, "}{".format) - self.assertRaises(ValueError, "{".format) - self.assertRaises(ValueError, "}".format) self.assertRaises(ValueError, "abc{0:{}".format) self.assertRaises(ValueError, "{0".format) self.assertRaises(IndexError, "{0.}".format) @@ -706,6 +696,46 @@ class UnicodeTest( self.assertRaises(ValueError, format, '', '#') self.assertRaises(ValueError, format, '', '#20') + def test_format_map(self): + self.assertEqual(''.format_map({}), '') + self.assertEqual('a'.format_map({}), 'a') + self.assertEqual('ab'.format_map({}), 'ab') + self.assertEqual('a{{'.format_map({}), 'a{') + self.assertEqual('a}}'.format_map({}), 'a}') + self.assertEqual('{{b'.format_map({}), '{b') + self.assertEqual('}}b'.format_map({}), '}b') + self.assertEqual('a{{b'.format_map({}), 'a{b') + + # using mappings + class Mapping(dict): + def __missing__(self, key): + return key + self.assertEqual('{hello}'.format_map(Mapping()), 'hello') + self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world') + + class InternalMapping: + def __init__(self): + self.mapping = {'a': 'hello'} + def __getitem__(self, key): + return self.mapping[key] + self.assertEqual('{a}'.format_map(InternalMapping()), 'hello') + + + class C: + def __init__(self, x=100): + self._x = x + def __format__(self, spec): + return spec + self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20') + + # test various errors + self.assertRaises(TypeError, '{'.format_map) + self.assertRaises(TypeError, '}'.format_map) + self.assertRaises(TypeError, 'a{'.format_map) + self.assertRaises(TypeError, 'a}'.format_map) + self.assertRaises(TypeError, '{a'.format_map) + self.assertRaises(TypeError, '}a'.format_map) + def test_format_auto_numbering(self): class C: def __init__(self, x=100): @@ -1106,6 +1136,10 @@ class UnicodeTest( self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict') self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x") self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x") + self.assertEqual('Andr\202 x'.encode('ascii', 'replace'), + 'Andr\202 x'.encode('ascii', errors='replace')) + self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'), + 'Andr\202 x'.encode(encoding='ascii', errors='ignore')) # Error handling (decoding) self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii') @@ -1135,8 +1169,13 @@ class UnicodeTest( # Error handling (wrong arguments) self.assertRaises(TypeError, "hello".encode, 42, 42, 42) - # Error handling (PyUnicode_EncodeDecimal()) - self.assertRaises(UnicodeError, int, "\u0200") + # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII()) + self.assertRaises(UnicodeError, int, "\ud800") + self.assertRaises(UnicodeError, int, "\udf00") + self.assertRaises(UnicodeError, float, "\ud800") + self.assertRaises(UnicodeError, float, "\udf00") + self.assertRaises(UnicodeError, complex, "\ud800") + self.assertRaises(UnicodeError, complex, "\udf00") def test_codecs(self): # Encoding @@ -1189,8 +1228,8 @@ class UnicodeTest( s = bytes(range(128)) for encoding in ( 'cp037', 'cp1026', - 'cp437', 'cp500', 'cp737', 'cp775', 'cp850', - 'cp852', 'cp855', 'cp860', 'cp861', 'cp862', + 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', + 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp865', 'cp866', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', @@ -1217,8 +1256,8 @@ class UnicodeTest( s = bytes(range(128, 256)) for encoding in ( 'cp037', 'cp1026', - 'cp437', 'cp500', 'cp737', 'cp775', 'cp850', - 'cp852', 'cp855', 'cp860', 'cp861', 'cp862', + 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', + 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp865', 'cp866', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_4', 'iso8859_5', @@ -1266,21 +1305,20 @@ class UnicodeTest( y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") self.assertEqual(x, y) - # FIXME - #y = r'\U00100000' - #x = y.encode("raw-unicode-escape").decode("raw-unicode-escape") - #self.assertEqual(x, y) - #y = r'\U00010000' - #x = y.encode("raw-unicode-escape").decode("raw-unicode-escape") - #self.assertEqual(x, y) - - #try: - # '\U11111111'.decode("raw-unicode-escape") - #except UnicodeDecodeError as e: - # self.assertEqual(e.start, 0) - # self.assertEqual(e.end, 10) - #else: - # self.fail("Should have raised UnicodeDecodeError") + y = br'\U00100000' + x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") + self.assertEqual(x, y) + y = br'\U00010000' + x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") + self.assertEqual(x, y) + + try: + br'\U11111111'.decode("raw-unicode-escape") + except UnicodeDecodeError as e: + self.assertEqual(e.start, 0) + self.assertEqual(e.end, 10) + else: + self.fail("Should have raised UnicodeDecodeError") def test_conversion(self): # Make sure __unicode__() works properly @@ -1354,6 +1392,10 @@ class UnicodeTest( self.assertEqual(repr(s1()), '\\n') self.assertEqual(repr(s2()), '\\n') + def test_printable_repr(self): + self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable + self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable + def test_expandtabs_overflows_gracefully(self): # This test only affects 32-bit platforms because expandtabs can only take # an int as the max value, not a 64-bit C long. If expandtabs is changed @@ -1382,6 +1424,165 @@ class UnicodeTest( self.assertEqual("%s" % s, '__str__ overridden') self.assertEqual("{}".format(s), '__str__ overridden') + # Test PyUnicode_FromFormat() + def test_from_format(self): + support.import_module('ctypes') + from ctypes import pythonapi, py_object, c_int + if sys.maxunicode == 65535: + name = "PyUnicodeUCS2_FromFormat" + else: + name = "PyUnicodeUCS4_FromFormat" + _PyUnicode_FromFormat = getattr(pythonapi, name) + _PyUnicode_FromFormat.restype = py_object + + def PyUnicode_FromFormat(format, *args): + cargs = tuple( + py_object(arg) if isinstance(arg, str) else arg + for arg in args) + return _PyUnicode_FromFormat(format, *cargs) + + # ascii format, non-ascii argument + text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9') + self.assertEqual(text, 'ascii\x7f=unicode\xe9') + + # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() + # raises an error + self.assertRaisesRegex(ValueError, + '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' + 'string, got a non-ASCII byte: 0xe9$', + PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + + self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') + self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') + + # other tests + text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') + self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") + + text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz') + self.assertEqual(text, 'repr=abc') + + # Test string decode from parameter of %s using utf-8. + # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of + # '\u4eba\u6c11' + text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') + self.assertEqual(text, 'repr=\u4eba\u6c11') + + #Test replace error handler. + text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff') + self.assertEqual(text, 'repr=abc\ufffd') + + # Test PyUnicode_AsWideChar() + def test_aswidechar(self): + from _testcapi import unicode_aswidechar + support.import_module('ctypes') + from ctypes import c_wchar, sizeof + + wchar, size = unicode_aswidechar('abcdef', 2) + self.assertEqual(size, 2) + self.assertEqual(wchar, 'ab') + + wchar, size = unicode_aswidechar('abc', 3) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc') + + wchar, size = unicode_aswidechar('abc', 4) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidechar('abc', 10) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidechar('abc\0def', 20) + self.assertEqual(size, 7) + self.assertEqual(wchar, 'abc\0def\0') + + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + buflen = 3 + nchar = 2 + else: # sizeof(c_wchar) == 4 + buflen = 2 + nchar = 1 + wchar, size = unicode_aswidechar(nonbmp, buflen) + self.assertEqual(size, nchar) + self.assertEqual(wchar, nonbmp + '\0') + + # Test PyUnicode_AsWideCharString() + def test_aswidecharstring(self): + from _testcapi import unicode_aswidecharstring + support.import_module('ctypes') + from ctypes import c_wchar, sizeof + + wchar, size = unicode_aswidecharstring('abc') + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidecharstring('abc\0def') + self.assertEqual(size, 7) + self.assertEqual(wchar, 'abc\0def\0') + + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + nchar = 2 + else: # sizeof(c_wchar) == 4 + nchar = 1 + wchar, size = unicode_aswidecharstring(nonbmp) + self.assertEqual(size, nchar) + self.assertEqual(wchar, nonbmp + '\0') + + +class StringModuleTest(unittest.TestCase): + def test_formatter_parser(self): + def parse(format): + return list(_string.formatter_parser(format)) + + formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}") + self.assertEqual(formatter, [ + ('prefix ', '2', '', 's'), + ('xxx', '0', '^+10.3f', None), + ('', 'obj.attr', '', 's'), + (' ', 'z[0]', '10', 's'), + ]) + + formatter = parse("prefix {} suffix") + self.assertEqual(formatter, [ + ('prefix ', '', '', None), + (' suffix', None, None, None), + ]) + + formatter = parse("str") + self.assertEqual(formatter, [ + ('str', None, None, None), + ]) + + formatter = parse("") + self.assertEqual(formatter, []) + + formatter = parse("{0}") + self.assertEqual(formatter, [ + ('', '0', '', None), + ]) + + self.assertRaises(TypeError, _string.formatter_parser, 1) + + def test_formatter_field_name_split(self): + def split(name): + items = list(_string.formatter_field_name_split(name)) + items[1] = list(items[1]) + return items + self.assertEqual(split("obj"), ["obj", []]) + self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]]) + self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]]) + self.assertEqual(split("obj.arg[key1][key2]"), [ + "obj", + [(True, 'arg'), + (False, 'key1'), + (False, 'key2'), + ]]) + self.assertRaises(TypeError, _string.formatter_field_name_split, 1) + def test_main(): support.run_unittest(__name__) |
