diff options
Diffstat (limited to 'Lib/test/test_unicode.py')
-rw-r--r-- | Lib/test/test_unicode.py | 556 |
1 files changed, 460 insertions, 96 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 47af8b9..9aaedd3 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -5,17 +5,13 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" +import _string import codecs import struct import sys import unittest import warnings from test import support, string_tests -import _string - -# decorator to skip tests on narrow builds -requires_wide_build = unittest.skipIf(sys.maxunicode == 65535, - 'requires wide build') # Error handling (bad decoder return) def search_function(encoding): @@ -175,6 +171,15 @@ class UnicodeTest(string_tests.CommonTest, def test_find(self): string_tests.CommonTest.test_find(self) + # test implementation details of the memchr fast path + self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102') + self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201') + self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120') + self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220') + self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304') + self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204') + self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004') + # check mixed argument types self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc') self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1) self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4) @@ -184,6 +189,14 @@ class UnicodeTest(string_tests.CommonTest, def test_rfind(self): string_tests.CommonTest.test_rfind(self) + # test implementation details of the memrchr fast path + self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102') + self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201') + self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120') + self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220') + self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304') + self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204') + self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004') # check mixed argument types self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') @@ -280,6 +293,12 @@ class UnicodeTest(string_tests.CommonTest, self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1) self.assertRaises(TypeError, 'replace'.replace, "r", 42) + @support.cpython_only + def test_replace_id(self): + pattern = 'abc' + text = 'abc def' + self.assertIs(text.replace(pattern, pattern), text) + def test_bytes_comparison(self): with support.check_warnings(): warnings.simplefilter('ignore', BytesWarning) @@ -350,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest, def test_islower(self): string_tests.MixinStrUnicodeUserStringTest.test_islower(self) self.checkequalnofix(False, '\u1FFc', 'islower') + self.assertFalse('\u2167'.islower()) + self.assertTrue('\u2177'.islower()) # non-BMP, uppercase self.assertFalse('\U00010401'.islower()) self.assertFalse('\U00010427'.islower()) @@ -364,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest, string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) if not sys.platform.startswith('java'): self.checkequalnofix(False, '\u1FFc', 'isupper') + self.assertTrue('\u2167'.isupper()) + self.assertFalse('\u2177'.isupper()) # non-BMP, uppercase self.assertTrue('\U00010401'.isupper()) self.assertTrue('\U00010427'.isupper()) @@ -520,7 +543,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) - @requires_wide_build def test_lower(self): string_tests.CommonTest.test_lower(self) self.assertEqual('\U00010427'.lower(), '\U0001044F') @@ -530,8 +552,28 @@ class UnicodeTest(string_tests.CommonTest, '\U0001044F\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.lower(), 'x\U0001044Fx\U0001044F') + self.assertEqual('fi'.lower(), 'fi') + self.assertEqual('\u0130'.lower(), '\u0069\u0307') + # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 + self.assertEqual('\u03a3'.lower(), '\u03c3') + self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3') + self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') + self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a') + self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') + self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345') + self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ') + self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') + self.assertEqual('\u2177'.lower(), '\u2177') + + def test_casefold(self): + self.assertEqual('hello'.casefold(), 'hello') + self.assertEqual('hELlo'.casefold(), 'hello') + self.assertEqual('ß'.casefold(), 'ss') + self.assertEqual('fi'.casefold(), 'fi') + self.assertEqual('\u03a3'.casefold(), '\u03c3') + self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3') + self.assertEqual('\u00b5'.casefold(), '\u03bc') - @requires_wide_build def test_upper(self): string_tests.CommonTest.test_upper(self) self.assertEqual('\U0001044F'.upper(), '\U00010427') @@ -541,8 +583,14 @@ class UnicodeTest(string_tests.CommonTest, '\U00010427\U00010427') self.assertEqual('X\U00010427x\U0001044F'.upper(), 'X\U00010427X\U00010427') + self.assertEqual('fi'.upper(), 'FI') + self.assertEqual('\u0130'.upper(), '\u0130') + self.assertEqual('\u03a3'.upper(), '\u03a3') + self.assertEqual('ß'.upper(), 'SS') + self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300') + self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') + self.assertEqual('\u2177'.upper(), '\u2167') - @requires_wide_build def test_capitalize(self): string_tests.CommonTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') @@ -554,8 +602,12 @@ class UnicodeTest(string_tests.CommonTest, '\U00010427\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 'X\U0001044Fx\U0001044F') + self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307') + exp = '\u0399\u0308\u0300\u0069\u0307' + self.assertEqual('\u1fd2\u0130'.capitalize(), exp) + self.assertEqual('finnish'.capitalize(), 'FInnish') + self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') - @requires_wide_build def test_title(self): string_tests.MixinStrUnicodeUserStringTest.test_title(self) self.assertEqual('\U0001044F'.title(), '\U00010427') @@ -569,8 +621,10 @@ class UnicodeTest(string_tests.CommonTest, '\U00010427\U0001044F \U00010427\U0001044F') self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') + self.assertEqual('fiNNISH'.title(), 'Finnish') + self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy') + self.assertEqual('A\u03a3A'.title(), 'A\u03c3a') - @requires_wide_build def test_swapcase(self): string_tests.CommonTest.test_swapcase(self) self.assertEqual('\U0001044F'.swapcase(), '\U00010427') @@ -583,6 +637,19 @@ class UnicodeTest(string_tests.CommonTest, '\U00010427\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.swapcase(), 'x\U0001044FX\U00010427') + self.assertEqual('fi'.swapcase(), 'FI') + self.assertEqual('\u0130'.swapcase(), '\u0069\u0307') + # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 + self.assertEqual('\u03a3'.swapcase(), '\u03c3') + self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3') + self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') + self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A') + self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') + self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399') + self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ') + self.assertEqual('\u03a3'.swapcase(), '\u03c3') + self.assertEqual('ß'.swapcase(), 'SS') + self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') def test_contains(self): # Testing Unicode contains method @@ -776,7 +843,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('{0!s}'.format(G('data')), 'string is data') msg = 'object.__format__ with a non-empty format string is deprecated' - with support.check_warnings((msg, PendingDeprecationWarning)): + with support.check_warnings((msg, DeprecationWarning)): self.assertEqual('{0:^10}'.format(E('data')), ' E(data) ') self.assertEqual('{0:^10s}'.format(E('data')), ' E(data) ') self.assertEqual('{0:>15s}'.format(G('data')), ' string is data') @@ -858,6 +925,14 @@ class UnicodeTest(string_tests.CommonTest, self.assertRaises(ValueError, format, '', '#') self.assertRaises(ValueError, format, '', '#20') + # Non-ASCII + self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"), + 'ABC\u0410\u0411\u0412') + self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"), + 'ABC') + self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"), + '') + def test_format_map(self): self.assertEqual(''.format_map({}), '') self.assertEqual('a'.format_map({}), 'a') @@ -1005,6 +1080,10 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('%f' % INF, 'inf') self.assertEqual('%F' % INF, 'INF') + # PEP 393 + self.assertEqual('%.1s' % "a\xe9\u20ac", 'a') + self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') + @support.cpython_only def test_formatting_huge_precision(self): from _testcapi import INT_MAX @@ -1041,10 +1120,13 @@ class UnicodeTest(string_tests.CommonTest, class UnicodeSubclass(str): pass - self.assertEqual( - str(UnicodeSubclass('unicode subclass becomes unicode')), - 'unicode subclass becomes unicode' - ) + for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'): + subclass = UnicodeSubclass(text) + self.assertEqual(str(subclass), text) + self.assertEqual(len(subclass), len(text)) + if text == 'ascii': + self.assertEqual(subclass.encode('ascii'), b'ascii') + self.assertEqual(subclass.encode('utf-8'), b'ascii') self.assertEqual( str('strings are converted to unicode'), @@ -1170,15 +1252,12 @@ class UnicodeTest(string_tests.CommonTest, def test_codecs_utf8(self): self.assertEqual(''.encode('utf-8'), b'') self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') - if sys.maxunicode == 65535: - self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82') - self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96') + self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') + self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') - if sys.maxunicode == 65535: - self.assertEqual( - ('\ud800\udc02'*1000).encode('utf-8'), - b'\xf0\x90\x80\x82'*1000) + self.assertEqual(('\U00010002'*10).encode('utf-8'), + b'\xf0\x90\x80\x82'*10) self.assertEqual( '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' @@ -1290,7 +1369,7 @@ class UnicodeTest(string_tests.CommonTest, # with start byte of a 2-byte sequence (b'\xc2', FFFD), # only the start byte (b'\xc2\xc2', FFFD*2), # 2 start bytes - (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes + (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes (b'\xc2\x41', FFFD+'A'), # invalid continuation byte # with start byte of a 3-byte sequence (b'\xe1', FFFD), # only the start byte @@ -1360,6 +1439,226 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(seq.decode('utf-8', 'ignore'), res.replace('\uFFFD', '')) + def to_bytestring(self, seq): + return bytes(int(c, 16) for c in seq.split()) + + def assertCorrectUTF8Decoding(self, seq, res, err): + """ + Check that an invalid UTF-8 sequence raises an UnicodeDecodeError when + 'strict' is used, returns res when 'replace' is used, and that doesn't + return anything when 'ignore' is used. + """ + with self.assertRaises(UnicodeDecodeError) as cm: + seq.decode('utf-8') + exc = cm.exception + + self.assertIn(err, str(exc)) + self.assertEqual(seq.decode('utf-8', 'replace'), res) + self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'), + 'aaaa' + res + 'bbbb') + res = res.replace('\ufffd', '') + self.assertEqual(seq.decode('utf-8', 'ignore'), res) + self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'), + 'aaaa' + res + 'bbbb') + + def test_invalid_start_byte(self): + """ + Test that an 'invalid start byte' error is raised when the first byte + is not in the ASCII range or is not a valid start byte of a 2-, 3-, or + 4-bytes sequence. The invalid start byte is replaced with a single + U+FFFD when errors='replace'. + E.g. <80> is a continuation byte and can appear only after a start byte. + """ + FFFD = '\ufffd' + for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF': + self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd', + 'invalid start byte') + + def test_unexpected_end_of_data(self): + """ + Test that an 'unexpected end of data' error is raised when the string + ends after a start byte of a 2-, 3-, or 4-bytes sequence without having + enough continuation bytes. The incomplete sequence is replaced with a + single U+FFFD when errors='replace'. + E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes + sequence, but it's followed by only 2 valid continuation bytes and the + last continuation bytes is missing. + Note: the continuation bytes must be all valid, if one of them is + invalid another error will be raised. + """ + sequences = [ + 'C2', 'DF', + 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF', + 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF', + 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF', + 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF', + 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF', + 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF' + ] + FFFD = '\ufffd' + for seq in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd', + 'unexpected end of data') + + def test_invalid_cb_for_2bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte of a 2-bytes sequence is invalid. The start byte + is replaced by a single U+FFFD and the second byte is handled + separately when errors='replace'. + E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes + sequence, but 41 is not a valid continuation byte because it's the + ASCII letter 'A'. + """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'), + ('C2 C0', FFFDx2), ('C2 FF', FFFDx2), + ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'), + ('DF C0', FFFDx2), ('DF FF', FFFDx2), + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res, + 'invalid continuation byte') + + def test_invalid_cb_for_3bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte(s) of a 3-bytes sequence are invalid. When + errors='replace', if the first continuation byte is valid, the first + two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the + third byte is handled separately, otherwise only the start byte is + replaced with a U+FFFD and the other continuation bytes are handled + separately. + E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes + sequence, 80 is a valid continuation byte, but 41 is not a valid cb + because it's the ASCII letter 'A'. + Note: when the start byte is E0 or ED, the valid ranges for the first + continuation byte are limited to A0..BF and 80..9F respectively. + Python 2 used to consider all the bytes in range 80..BF valid when the + start byte was ED. This is fixed in Python 3. + """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2), + ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2), + ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'), + ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2), + ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'), + ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'), + ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2), + ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'), + ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2), + ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'), + ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'), + ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2), + ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'), + ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2), + ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'), + ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'), + ('ED 7F', FFFD+'\x7f'), + ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^ + ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'), + ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2), + ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'), + ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2), + ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'), + ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2), + ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'), + ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2), + ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'), + ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'), + ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2), + ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'), + ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2), + ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'), + ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2), + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res, + 'invalid continuation byte') + + def test_invalid_cb_for_4bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte(s) of a 4-bytes sequence are invalid. When + errors='replace',the start byte and all the following valid + continuation bytes are replaced with a single U+FFFD, and all the bytes + starting from the first invalid continuation bytes (included) are + handled separately. + E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes + sequence, 80 is a valid continuation byte, but 41 is not a valid cb + because it's the ASCII letter 'A'. + Note: when the start byte is E0 or ED, the valid ranges for the first + continuation byte are limited to A0..BF and 80..9F respectively. + However, when the start byte is ED, Python 2 considers all the bytes + in range 80..BF valid. This is fixed in Python 3. + """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2), + ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2), + ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'), + ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2), + ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'), + ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2), + ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'), + ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2), + ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'), + ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2), + ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'), + ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2), + ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'), + ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2), + ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2), + ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'), + ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2), + ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'), + ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2), + ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'), + ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2), + ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'), + ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2), + ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'), + ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2), + ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'), + ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2), + ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'), + ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2), + ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'), + ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2), + ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'), + ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2), + ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'), + ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2), + ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'), + ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2), + ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'), + ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2), + ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'), + ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2), + ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2), + ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2), + ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'), + ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2), + ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'), + ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2), + ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'), + ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2), + ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'), + ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2), + ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'), + ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2), + ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'), + ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2) + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res, + 'invalid continuation byte') + def test_codecs_idna(self): # Test whether trailing dot is preserved self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.") @@ -1391,14 +1690,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertRaises(TypeError, str, b"hello", "test.unicode2") self.assertRaises(TypeError, "hello".encode, "test.unicode1") self.assertRaises(TypeError, "hello".encode, "test.unicode2") - # executes PyUnicode_Encode() - import imp - self.assertRaises( - ImportError, - imp.find_module, - "non-existing module", - ["non-existing dir"] - ) # Error handling (wrong arguments) self.assertRaises(TypeError, "hello".encode, 42, 42, 42) @@ -1416,18 +1707,25 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('hello'.encode('ascii'), b'hello') self.assertEqual('hello'.encode('utf-7'), b'hello') self.assertEqual('hello'.encode('utf-8'), b'hello') - self.assertEqual('hello'.encode('utf8'), b'hello') + self.assertEqual('hello'.encode('utf-8'), b'hello') self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000') self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o') self.assertEqual('hello'.encode('latin-1'), b'hello') + # Default encoding is utf-8 + self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83') + # Roundtrip safety for BMP (just the first 1024 chars) for c in range(1024): u = chr(c) for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): - self.assertEqual(str(u.encode(encoding),encoding), u) + with warnings.catch_warnings(): + # unicode-internal has been deprecated + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(str(u.encode(encoding),encoding), u) # Roundtrip safety for BMP (just the first 256 chars) for c in range(256): @@ -1442,18 +1740,20 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(str(u.encode(encoding),encoding), u) # Roundtrip safety for non-BMP (just a few chars) - u = '\U00010001\U00020002\U00030003\U00040004\U00050005' - for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', - #'raw_unicode_escape', - 'unicode_escape', 'unicode_internal'): - self.assertEqual(str(u.encode(encoding),encoding), u) + with warnings.catch_warnings(): + # unicode-internal has been deprecated + warnings.simplefilter("ignore", DeprecationWarning) - # UTF-8 must be roundtrip safe for all UCS-2 code points - # This excludes surrogates: in the full range, there would be - # a surrogate pair (\udbff\udc00), which gets converted back - # to a non-BMP character (\U0010fc00) - u = ''.join(map(chr, list(range(0,0xd800)) + - list(range(0xe000,0x10000)))) + u = '\U00010001\U00020002\U00030003\U00040004\U00050005' + for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', + 'raw_unicode_escape', + 'unicode_escape', 'unicode_internal'): + self.assertEqual(str(u.encode(encoding),encoding), u) + + # UTF-8 must be roundtrip safe for all code points + # (except surrogates, which are forbidden). + u = ''.join(map(chr, list(range(0, 0xd800)) + + list(range(0xe000, 0x110000)))) for encoding in ('utf-8',): self.assertEqual(str(u.encode(encoding),encoding), u) @@ -1638,17 +1938,39 @@ class UnicodeTest(string_tests.CommonTest, return self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize) + @support.cpython_only + def test_expandtabs_optimization(self): + s = 'abc' + self.assertIs(s.expandtabs(), s) + def test_raiseMemError(self): - # Ensure that the freelist contains a consistent object, even - # when a string allocation fails with a MemoryError. - # This used to crash the interpreter, - # or leak references when the number was smaller. - charwidth = 4 if sys.maxunicode >= 0x10000 else 2 - # Note: sys.maxsize is half of the actual max allocation because of - # the signedness of Py_ssize_t. - alloc = lambda: "a" * (sys.maxsize // charwidth * 2) - self.assertRaises(MemoryError, alloc) - self.assertRaises(MemoryError, alloc) + if struct.calcsize('P') == 8: + # 64 bits pointers + ascii_struct_size = 48 + compact_struct_size = 72 + else: + # 32 bits pointers + ascii_struct_size = 24 + compact_struct_size = 36 + + for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): + code = ord(char) + if code < 0x100: + char_size = 1 # sizeof(Py_UCS1) + struct_size = ascii_struct_size + elif code < 0x10000: + char_size = 2 # sizeof(Py_UCS2) + struct_size = compact_struct_size + else: + char_size = 4 # sizeof(Py_UCS4) + struct_size = compact_struct_size + # Note: sys.maxsize is half of the actual max allocation because of + # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle + # be allocatable, given enough memory. + maxlen = ((sys.maxsize - struct_size) // char_size) + alloc = lambda: char * maxlen + self.assertRaises(MemoryError, alloc) + self.assertRaises(MemoryError, alloc) def test_format_subclass(self): class S(str): @@ -1661,11 +1983,10 @@ class UnicodeTest(string_tests.CommonTest, # Test PyUnicode_FromFormat() def test_from_format(self): support.import_module('ctypes') - from ctypes import pythonapi, py_object, c_int - if sys.maxunicode == 65535: - name = "PyUnicodeUCS2_FromFormat" - else: - name = "PyUnicodeUCS4_FromFormat" + from ctypes import (pythonapi, py_object, + c_int, c_long, c_longlong, c_ssize_t, + c_uint, c_ulong, c_ulonglong, c_size_t) + name = "PyUnicode_FromFormat" _PyUnicode_FromFormat = getattr(pythonapi, name) _PyUnicode_FromFormat.restype = py_object @@ -1686,13 +2007,40 @@ class UnicodeTest(string_tests.CommonTest, 'string, got a non-ASCII byte: 0xe9$', PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + # test "%c" self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') - # other tests + # test "%" + self.assertEqual(PyUnicode_FromFormat(b'%'), '%') + self.assertEqual(PyUnicode_FromFormat(b'%%'), '%') + self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s') + self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]') + self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc') + + # test integer formats (%i, %d, %u) + self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010') + self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010') + self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123') + + self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123') + self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123') + + self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123') + self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123') + self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123') + self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123') + + # test %A text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") + # test %V text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz') self.assertEqual(text, 'repr=abc') @@ -1706,6 +2054,13 @@ class UnicodeTest(string_tests.CommonTest, text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff') self.assertEqual(text, 'repr=abc\ufffd') + # not supported: copy the raw format string. these tests are just here + # to check for crashs and should not be considered as specifications + self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s') + self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc') + self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i') + self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s') + # Test PyUnicode_AsWideChar() def test_aswidechar(self): from _testcapi import unicode_aswidechar @@ -1766,6 +2121,51 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(size, nchar) self.assertEqual(wchar, nonbmp + '\0') + def test_subclass_add(self): + class S(str): + def __add__(self, o): + return "3" + self.assertEqual(S("4") + S("5"), "3") + class S(str): + def __iadd__(self, o): + return "3" + s = S("1") + s += "4" + self.assertEqual(s, "3") + + def test_encode_decimal(self): + from _testcapi import unicode_encodedecimal + self.assertEqual(unicode_encodedecimal('123'), + b'123') + self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), + b'3.14') + self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), + b' 3.14 ') + self.assertRaises(UnicodeEncodeError, + unicode_encodedecimal, "123\u20ac", "strict") + self.assertRaisesRegex( + ValueError, + "^'decimal' codec can't encode character", + unicode_encodedecimal, "123\u20ac", "replace") + + def test_transform_decimal(self): + from _testcapi import unicode_transformdecimaltoascii as transform_decimal + self.assertEqual(transform_decimal('123'), + '123') + self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), + '3.14') + self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), + "\N{EM SPACE}3.14\N{EN SPACE}") + self.assertEqual(transform_decimal('123\u20ac'), + '123\u20ac') + + def test_getnewargs(self): + text = 'abc' + args = text.__getnewargs__() + self.assertIsNot(args[0], text) + self.assertEqual(args[0], text) + self.assertEqual(len(args), 1) + class StringModuleTest(unittest.TestCase): def test_formatter_parser(self): @@ -1817,42 +2217,6 @@ class StringModuleTest(unittest.TestCase): ]]) self.assertRaises(TypeError, _string.formatter_field_name_split, 1) - def test_encode_decimal(self): - from _testcapi import unicode_encodedecimal - self.assertEqual(unicode_encodedecimal('123'), - b'123') - self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), - b'3.14') - self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), - b' 3.14 ') - self.assertRaises(UnicodeEncodeError, - unicode_encodedecimal, "123\u20ac", "strict") - self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"), - b'123?') - self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"), - b'123') - self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"), - b'123€') - self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"), - b'123\\u20ac') - self.assertEqual(unicode_encodedecimal("123\u20ac\N{EM SPACE}", "replace"), - b'123? ') - self.assertEqual(unicode_encodedecimal("123\u20ac\u20ac", "replace"), - b'123??') - self.assertEqual(unicode_encodedecimal("123\u20ac\u0660", "replace"), - b'123?0') - - def test_transform_decimal(self): - from _testcapi import unicode_transformdecimaltoascii as transform_decimal - self.assertEqual(transform_decimal('123'), - '123') - self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), - '3.14') - self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), - "\N{EM SPACE}3.14\N{EN SPACE}") - self.assertEqual(transform_decimal('123\u20ac'), - '123\u20ac') - def test_main(): support.run_unittest(__name__) |