diff options
Diffstat (limited to 'Lib/test/test_unicode.py')
| -rw-r--r-- | Lib/test/test_unicode.py | 373 |
1 files changed, 238 insertions, 135 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index f046938..81e49d6 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -4,10 +4,11 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import _string import codecs import itertools +import operator import struct import sys import unittest @@ -318,6 +319,7 @@ class UnicodeTest(string_tests.CommonTest, {ord('a'): None, ord('b'): ''}) self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'): 'yy'}) + # this needs maketrans() self.checkequalnofix('abababc', 'abababc', 'translate', {'b': '<i>'}) @@ -327,6 +329,43 @@ class UnicodeTest(string_tests.CommonTest, tbl = self.type2test.maketrans('abc', 'xyz', 'd') self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl) + # various tests switching from ASCII to latin1 or the opposite; + # same length, remove a letter, or replace with a longer string. + self.assertEqual("[a]".translate(str.maketrans('a', 'X')), + "[X]") + self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})), + "[X]") + self.assertEqual("[a]".translate(str.maketrans({'a': None})), + "[]") + self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})), + "[XXX]") + self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})), + "[\xe9]") + self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})), + "x123") + self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})), + "x\xe9") + + # test non-ASCII (don't take the fast-path) + self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})), + "[<\xe9>]") + self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})), + "[a]") + self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})), + "[]") + self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})), + "[123]") + self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})), + "[<\u20ac>\xe9]") + + # invalid Unicode characters + invalid_char = 0x10ffff+1 + for before in "a\xe9\u20ac\U0010ffff": + mapping = str.maketrans({before: invalid_char}) + text = "[%s]" % before + self.assertRaises(ValueError, text.translate, mapping) + + # errors self.assertRaises(TypeError, self.type2test.maketrans) self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg') self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def') @@ -341,10 +380,6 @@ class UnicodeTest(string_tests.CommonTest, def test_split(self): string_tests.CommonTest.test_split(self) - # Mixed arguments - self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//') - self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//') - self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test') # test mixed kinds for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): left *= 9 @@ -526,7 +561,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertTrue('\ud800\udc02' < '\ud84d\udc56') def test_islower(self): - string_tests.MixinStrUnicodeUserStringTest.test_islower(self) + super().test_islower() self.checkequalnofix(False, '\u1FFc', 'islower') self.assertFalse('\u2167'.islower()) self.assertTrue('\u2177'.islower()) @@ -541,7 +576,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse('\U0001F46F'.islower()) def test_isupper(self): - string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) + super().test_isupper() if not sys.platform.startswith('java'): self.checkequalnofix(False, '\u1FFc', 'isupper') self.assertTrue('\u2167'.isupper()) @@ -557,7 +592,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse('\U0001F46F'.isupper()) def test_istitle(self): - string_tests.MixinStrUnicodeUserStringTest.test_istitle(self) + super().test_istitle() self.checkequalnofix(True, '\u1FFc', 'istitle') self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle') @@ -569,7 +604,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch)) def test_isspace(self): - string_tests.MixinStrUnicodeUserStringTest.test_isspace(self) + super().test_isspace() self.checkequalnofix(True, '\u2000', 'isspace') self.checkequalnofix(True, '\u200a', 'isspace') self.checkequalnofix(False, '\u2014', 'isspace') @@ -579,13 +614,13 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) def test_isalnum(self): - string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self) + super().test_isalnum() for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch)) def test_isalpha(self): - string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self) + super().test_isalpha() self.checkequalnofix(True, '\u1FFc', 'isalpha') # non-BMP, cased self.assertTrue('\U00010401'.isalpha()) @@ -615,7 +650,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) def test_isdigit(self): - string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self) + super().test_isdigit() self.checkequalnofix(True, '\u2460', 'isdigit') self.checkequalnofix(False, '\xbc', 'isdigit') self.checkequalnofix(True, '\u0660', 'isdigit') @@ -768,7 +803,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') def test_title(self): - string_tests.MixinStrUnicodeUserStringTest.test_title(self) + super().test_title() self.assertEqual('\U0001044F'.title(), '\U00010427') self.assertEqual('\U0001044F\U0001044F'.title(), '\U00010427\U0001044F') @@ -1317,20 +1352,20 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') #issue 19995 - class PsuedoInt: + class PseudoInt: def __init__(self, value): self.value = int(value) def __int__(self): return self.value def __index__(self): return self.value - class PsuedoFloat: + class PseudoFloat: def __init__(self, value): self.value = float(value) def __int__(self): return int(self.value) - pi = PsuedoFloat(3.1415) - letter_m = PsuedoInt(109) + pi = PseudoFloat(3.1415) + letter_m = PseudoInt(109) self.assertEqual('%x' % 42, '2a') self.assertEqual('%X' % 15, 'F') self.assertEqual('%o' % 9, '11') @@ -1339,11 +1374,11 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('%X' % letter_m, '6D') self.assertEqual('%o' % letter_m, '155') self.assertEqual('%c' % letter_m, 'm') - self.assertWarns(DeprecationWarning, '%x'.__mod__, pi), - self.assertWarns(DeprecationWarning, '%x'.__mod__, 3.14), - self.assertWarns(DeprecationWarning, '%X'.__mod__, 2.11), - self.assertWarns(DeprecationWarning, '%o'.__mod__, 1.79), - self.assertWarns(DeprecationWarning, '%c'.__mod__, pi), + self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14), + self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11), + self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79), + self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi), + self.assertRaises(TypeError, operator.mod, '%c', pi), def test_formatting_with_enum(self): # issue18780 @@ -1739,7 +1774,7 @@ class UnicodeTest(string_tests.CommonTest, def assertCorrectUTF8Decoding(self, seq, res, err): """ - Check that an invalid UTF-8 sequence raises an UnicodeDecodeError when + Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when 'strict' is used, returns res when 'replace' is used, and that doesn't return anything when 'ignore' is used. """ @@ -2061,7 +2096,8 @@ class UnicodeTest(string_tests.CommonTest, 'cp863', 'cp865', 'cp866', 'cp1125', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', - 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1', + 'iso8859_7', 'iso8859_9', + 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1', 'mac_cyrillic', 'mac_latin2', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', @@ -2089,14 +2125,14 @@ class UnicodeTest(string_tests.CommonTest, 'cp863', 'cp865', 'cp866', 'cp1125', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_4', 'iso8859_5', - 'iso8859_9', 'koi8_r', 'latin_1', + 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1', 'mac_cyrillic', 'mac_latin2', ### These have undefined mappings: #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', #'cp1256', 'cp1257', 'cp1258', #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', - #'iso8859_3', 'iso8859_6', 'iso8859_7', + #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048', #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', ### These fail the round-trip: @@ -2239,6 +2275,123 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual("%s" % s, '__str__ overridden') self.assertEqual("{}".format(s), '__str__ overridden') + def test_subclass_add(self): + class S(str): + def __add__(self, o): + return "3" + self.assertEqual(S("4") + S("5"), "3") + class S(str): + def __iadd__(self, o): + return "3" + s = S("1") + s += "4" + self.assertEqual(s, "3") + + def test_getnewargs(self): + text = 'abc' + args = text.__getnewargs__() + self.assertIsNot(args[0], text) + self.assertEqual(args[0], text) + self.assertEqual(len(args), 1) + + def test_resize(self): + for length in range(1, 100, 7): + # generate a fresh string (refcount=1) + text = 'a' * length + 'b' + + with support.check_warnings(('unicode_internal codec has been ' + 'deprecated', DeprecationWarning)): + # fill wstr internal field + abc = text.encode('unicode_internal') + self.assertEqual(abc.decode('unicode_internal'), text) + + # resize text: wstr field must be cleared and then recomputed + text += 'c' + abcdef = text.encode('unicode_internal') + self.assertNotEqual(abc, abcdef) + self.assertEqual(abcdef.decode('unicode_internal'), text) + + def test_compare(self): + # Issue #17615 + N = 10 + ascii = 'a' * N + ascii2 = 'z' * N + latin = '\x80' * N + latin2 = '\xff' * N + bmp = '\u0100' * N + bmp2 = '\uffff' * N + astral = '\U00100000' * N + astral2 = '\U0010ffff' * N + strings = ( + ascii, ascii2, + latin, latin2, + bmp, bmp2, + astral, astral2) + for text1, text2 in itertools.combinations(strings, 2): + equal = (text1 is text2) + self.assertEqual(text1 == text2, equal) + self.assertEqual(text1 != text2, not equal) + + if equal: + self.assertTrue(text1 <= text2) + self.assertTrue(text1 >= text2) + + # text1 is text2: duplicate strings to skip the "str1 == str2" + # optimization in unicode_compare_eq() and really compare + # character per character + copy1 = duplicate_string(text1) + copy2 = duplicate_string(text2) + self.assertIsNot(copy1, copy2) + + self.assertTrue(copy1 == copy2) + self.assertFalse(copy1 != copy2) + + self.assertTrue(copy1 <= copy2) + self.assertTrue(copy2 >= copy2) + + self.assertTrue(ascii < ascii2) + self.assertTrue(ascii < latin) + self.assertTrue(ascii < bmp) + self.assertTrue(ascii < astral) + self.assertFalse(ascii >= ascii2) + self.assertFalse(ascii >= latin) + self.assertFalse(ascii >= bmp) + self.assertFalse(ascii >= astral) + + self.assertFalse(latin < ascii) + self.assertTrue(latin < latin2) + self.assertTrue(latin < bmp) + self.assertTrue(latin < astral) + self.assertTrue(latin >= ascii) + self.assertFalse(latin >= latin2) + self.assertFalse(latin >= bmp) + self.assertFalse(latin >= astral) + + self.assertFalse(bmp < ascii) + self.assertFalse(bmp < latin) + self.assertTrue(bmp < bmp2) + self.assertTrue(bmp < astral) + self.assertTrue(bmp >= ascii) + self.assertTrue(bmp >= latin) + self.assertFalse(bmp >= bmp2) + self.assertFalse(bmp >= astral) + + self.assertFalse(astral < ascii) + self.assertFalse(astral < latin) + self.assertFalse(astral < bmp2) + self.assertTrue(astral < astral2) + self.assertTrue(astral >= ascii) + self.assertTrue(astral >= latin) + self.assertTrue(astral >= bmp2) + self.assertFalse(astral >= astral2) + + def test_free_after_iterating(self): + support.check_free_after_iterating(self, iter, str) + support.check_free_after_iterating(self, reversed, str) + + +class CAPITest(unittest.TestCase): + # Test PyUnicode_FromFormat() def test_from_format(self): support.import_module('ctypes') @@ -2534,17 +2687,65 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(size, nchar) self.assertEqual(wchar, nonbmp + '\0') - def test_subclass_add(self): - class S(str): - def __add__(self, o): - return "3" - self.assertEqual(S("4") + S("5"), "3") - class S(str): - def __iadd__(self, o): - return "3" - s = S("1") - s += "4" - self.assertEqual(s, "3") + # Test PyUnicode_AsUCS4() + @support.cpython_only + def test_asucs4(self): + from _testcapi import unicode_asucs4 + for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', + 'a\ud800b\udfffc', '\ud834\udd1e']: + l = len(s) + self.assertEqual(unicode_asucs4(s, l, 1), s+'\0') + self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff') + self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff') + self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff') + self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1) + self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0) + s = '\0'.join([s, s]) + self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0') + self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff') + + # Test PyUnicode_CopyCharacters() + @support.cpython_only + def test_copycharacters(self): + from _testcapi import unicode_copycharacters + + strings = [ + 'abcde', '\xa1\xa2\xa3\xa4\xa5', + '\u4f60\u597d\u4e16\u754c\uff01', + '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' + ] + + for idx, from_ in enumerate(strings): + # wide -> narrow: exceed maxchar limitation + for to in strings[:idx]: + self.assertRaises( + SystemError, + unicode_copycharacters, to, 0, from_, 0, 5 + ) + # same kind + for from_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, 0, from_, from_start, 5), + (from_[from_start:from_start+5].ljust(5, '\0'), + 5-from_start) + ) + for to_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, to_start, from_, to_start, 5), + (from_[to_start:to_start+5].rjust(5, '\0'), + 5-to_start) + ) + # narrow -> wide + # Tests omitted since this creates invalid strings. + + s = strings[0] + self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) @support.cpython_only def test_encode_decimal(self): @@ -2574,104 +2775,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(transform_decimal('123\u20ac'), '123\u20ac') - def test_getnewargs(self): - text = 'abc' - args = text.__getnewargs__() - self.assertIsNot(args[0], text) - self.assertEqual(args[0], text) - self.assertEqual(len(args), 1) - - def test_resize(self): - for length in range(1, 100, 7): - # generate a fresh string (refcount=1) - text = 'a' * length + 'b' - - with support.check_warnings(('unicode_internal codec has been ' - 'deprecated', DeprecationWarning)): - # fill wstr internal field - abc = text.encode('unicode_internal') - self.assertEqual(abc.decode('unicode_internal'), text) - - # resize text: wstr field must be cleared and then recomputed - text += 'c' - abcdef = text.encode('unicode_internal') - self.assertNotEqual(abc, abcdef) - self.assertEqual(abcdef.decode('unicode_internal'), text) - - def test_compare(self): - # Issue #17615 - N = 10 - ascii = 'a' * N - ascii2 = 'z' * N - latin = '\x80' * N - latin2 = '\xff' * N - bmp = '\u0100' * N - bmp2 = '\uffff' * N - astral = '\U00100000' * N - astral2 = '\U0010ffff' * N - strings = ( - ascii, ascii2, - latin, latin2, - bmp, bmp2, - astral, astral2) - for text1, text2 in itertools.combinations(strings, 2): - equal = (text1 is text2) - self.assertEqual(text1 == text2, equal) - self.assertEqual(text1 != text2, not equal) - - if equal: - self.assertTrue(text1 <= text2) - self.assertTrue(text1 >= text2) - - # text1 is text2: duplicate strings to skip the "str1 == str2" - # optimization in unicode_compare_eq() and really compare - # character per character - copy1 = duplicate_string(text1) - copy2 = duplicate_string(text2) - self.assertIsNot(copy1, copy2) - - self.assertTrue(copy1 == copy2) - self.assertFalse(copy1 != copy2) - - self.assertTrue(copy1 <= copy2) - self.assertTrue(copy2 >= copy2) - - self.assertTrue(ascii < ascii2) - self.assertTrue(ascii < latin) - self.assertTrue(ascii < bmp) - self.assertTrue(ascii < astral) - self.assertFalse(ascii >= ascii2) - self.assertFalse(ascii >= latin) - self.assertFalse(ascii >= bmp) - self.assertFalse(ascii >= astral) - - self.assertFalse(latin < ascii) - self.assertTrue(latin < latin2) - self.assertTrue(latin < bmp) - self.assertTrue(latin < astral) - self.assertTrue(latin >= ascii) - self.assertFalse(latin >= latin2) - self.assertFalse(latin >= bmp) - self.assertFalse(latin >= astral) - - self.assertFalse(bmp < ascii) - self.assertFalse(bmp < latin) - self.assertTrue(bmp < bmp2) - self.assertTrue(bmp < astral) - self.assertTrue(bmp >= ascii) - self.assertTrue(bmp >= latin) - self.assertFalse(bmp >= bmp2) - self.assertFalse(bmp >= astral) - - self.assertFalse(astral < ascii) - self.assertFalse(astral < latin) - self.assertFalse(astral < bmp2) - self.assertTrue(astral < astral2) - self.assertTrue(astral >= ascii) - self.assertTrue(astral >= latin) - self.assertTrue(astral >= bmp2) - self.assertFalse(astral >= astral2) - @support.cpython_only def test_pep393_utf8_caching_bug(self): # Issue #25709: Problem with string concatenation and utf-8 cache |
