diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2011-09-28 05:41:54 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2011-09-28 05:41:54 (GMT) |
commit | d63a3b8beb4a0841cb59fb3515347ccaab34b733 (patch) | |
tree | 3b4e3cc63151c5a5a910c3550a190aefaea96ad4 /Lib | |
parent | 48d49497c50e79d14e9df9527d766ca3a0a38be5 (diff) | |
download | cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.zip cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.tar.gz cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.tar.bz2 |
Implement PEP 393.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/json/decoder.py | 3 | ||||
-rw-r--r-- | Lib/test/json_tests/test_scanstring.py | 11 | ||||
-rw-r--r-- | Lib/test/test_codeccallbacks.py | 7 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 4 | ||||
-rw-r--r-- | Lib/test/test_peepholer.py | 4 | ||||
-rw-r--r-- | Lib/test/test_re.py | 7 | ||||
-rw-r--r-- | Lib/test/test_sys.py | 38 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 41 |
8 files changed, 79 insertions, 36 deletions
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index 3174e31..e7c0539 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -121,8 +121,7 @@ def py_scanstring(s, end, strict=True, msg = "Invalid \\uXXXX escape" raise ValueError(errmsg(msg, s, end)) uni = int(esc, 16) - # Check for surrogate pair on UCS-4 systems - if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: + if 0xd800 <= uni <= 0xdbff: msg = "Invalid \\uXXXX\\uXXXX surrogate pair" if not s[end + 5:end + 7] == '\\u': raise ValueError(errmsg(msg, s, end)) diff --git a/Lib/test/json_tests/test_scanstring.py b/Lib/test/json_tests/test_scanstring.py index f82cdee..426c8dd 100644 --- a/Lib/test/json_tests/test_scanstring.py +++ b/Lib/test/json_tests/test_scanstring.py @@ -9,14 +9,9 @@ class TestScanstring: scanstring('"z\\ud834\\udd20x"', 1, True), ('z\U0001d120x', 16)) - if sys.maxunicode == 65535: - self.assertEqual( - scanstring('"z\U0001d120x"', 1, True), - ('z\U0001d120x', 6)) - else: - self.assertEqual( - scanstring('"z\U0001d120x"', 1, True), - ('z\U0001d120x', 5)) + self.assertEqual( + scanstring('"z\U0001d120x"', 1, True), + ('z\U0001d120x', 5)) self.assertEqual( scanstring('"\\u007b"', 1, True), diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index c5b1e25..317ae44 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1,5 +1,6 @@ import test.support, unittest import sys, codecs, html.entities, unicodedata +import ctypes class PosReturn: # this can be used for configurable callbacks @@ -577,8 +578,10 @@ class CodecCallbackTest(unittest.TestCase): UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), ("\\uffff", 1) ) - # 1 on UCS-4 builds, 2 on UCS-2 - len_wide = len("\U00010000") + if ctypes.sizeof(ctypes.c_wchar) == 2: + len_wide = 2 + else: + len_wide = 1 self.assertEqual( codecs.backslashreplace_errors( UnicodeEncodeError("ascii", "\U00010000", diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7a9e38c..17038cb 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -622,6 +622,10 @@ class UTF8Test(ReadTest): b"abc\xed\xa0\x80def") self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), "abc\ud800def") + self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), + b"\xf0\x90\xbf\xbf\xed\xa0\x80") + self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), + "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) class UTF7Test(ReadTest): diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py index e0e3f63..1e782cf 100644 --- a/Lib/test/test_peepholer.py +++ b/Lib/test/test_peepholer.py @@ -218,10 +218,6 @@ class TestTranforms(unittest.TestCase): # out of range asm = dis_single('"fuu"[10]') self.assertIn('BINARY_SUBSCR', asm) - # non-BMP char (see #5057) - asm = dis_single('"\U00012345"[0]') - self.assertIn('BINARY_SUBSCR', asm) - def test_folding_of_unaryops_on_constants(self): for line, elem in ( diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index e3d10f7..d23c49b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -780,6 +780,13 @@ class ReTests(unittest.TestCase): self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) self.assertRaises(TypeError, _sre.compile, {}, 0, []) + def test_search_dot_unicode(self): + self.assertIsNotNone(re.search("123.*-", '123abc-')) + self.assertIsNotNone(re.search("123.*-", '123\xe9-')) + self.assertIsNotNone(re.search("123.*-", '123\u20ac-')) + self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-')) + self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-')) + def run_re_tests(): from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR if verbose: diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 4355ef5..ed96d76 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -833,13 +833,39 @@ class SizeofTest(unittest.TestCase): class newstyleclass(object): pass check(newstyleclass, s) # unicode - usize = len('\0'.encode('unicode-internal')) - samples = ['', '1'*100] - # we need to test for both sizes, because we don't know if the string - # has been cached + # each tuple contains a string and its expected character size + # don't put any static strings here, as they may contain + # wchar_t or UTF-8 representations + samples = ['1'*100, '\xff'*50, + '\u0100'*40, '\uffff'*100, + '\U00010000'*30, '\U0010ffff'*100] + asciifields = h + "PPiP" + compactfields = asciifields + "PPP" + unicodefields = compactfields + "P" for s in samples: - basicsize = size(h + 'PPPiP') + usize * (len(s) + 1) - check(s, basicsize) + maxchar = ord(max(s)) + if maxchar < 128: + L = size(asciifields) + len(s) + 1 + elif maxchar < 256: + L = size(compactfields) + len(s) + 1 + elif maxchar < 65536: + L = size(compactfields) + 2*(len(s) + 1) + else: + L = size(compactfields) + 4*(len(s) + 1) + check(s, L) + # verify that the UTF-8 size is accounted for + s = chr(0x4000) # 4 bytes canonical representation + check(s, size(compactfields) + 4) + try: + # FIXME: codecs.lookup(str) calls encoding.search_function() which + # calls __import__ using str in the module name. __import__ encodes + # the module name to the file system encoding (which is the locale + # encoding), so test_sys fails if the locale encoding is not UTF-8. + codecs.lookup(s) # produces 4 bytes UTF-8 + except LookupError: + check(s, size(compactfields) + 4 + 4) + # TODO: add check that forces the presence of wchar_t representation + # TODO: add check that forces layout of unicodefields # weakref import weakref check(weakref.ref(int), size(h + '2Pl2P')) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index f7424c0..f256ba6 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1583,16 +1583,32 @@ class UnicodeTest(string_tests.CommonTest, self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize) def test_raiseMemError(self): - # Ensure that the freelist contains a consistent object, even - # when a string allocation fails with a MemoryError. - # This used to crash the interpreter, - # or leak references when the number was smaller. - charwidth = 4 if sys.maxunicode >= 0x10000 else 2 - # Note: sys.maxsize is half of the actual max allocation because of - # the signedness of Py_ssize_t. - alloc = lambda: "a" * (sys.maxsize // charwidth * 2) - self.assertRaises(MemoryError, alloc) - self.assertRaises(MemoryError, alloc) + if struct.calcsize('P') == 8: + # 64 bits pointers + ascii_struct_size = 64 + compact_struct_size = 88 + else: + # 32 bits pointers + ascii_struct_size = 32 + compact_struct_size = 44 + + for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): + code = ord(char) + if code < 0x100: + char_size = 1 # sizeof(Py_UCS1) + struct_size = ascii_struct_size + elif code < 0x10000: + char_size = 2 # sizeof(Py_UCS2) + struct_size = compact_struct_size + else: + char_size = 4 # sizeof(Py_UCS4) + struct_size = compact_struct_size + # Note: sys.maxsize is half of the actual max allocation because of + # the signedness of Py_ssize_t. -1 because of the null character. + maxlen = ((sys.maxsize - struct_size) // char_size) - 1 + alloc = lambda: char * maxlen + self.assertRaises(MemoryError, alloc) + self.assertRaises(MemoryError, alloc) def test_format_subclass(self): class S(str): @@ -1608,10 +1624,7 @@ class UnicodeTest(string_tests.CommonTest, from ctypes import (pythonapi, py_object, c_int, c_long, c_longlong, c_ssize_t, c_uint, c_ulong, c_ulonglong, c_size_t) - if sys.maxunicode == 65535: - name = "PyUnicodeUCS2_FromFormat" - else: - name = "PyUnicodeUCS4_FromFormat" + name = "PyUnicode_FromFormat" _PyUnicode_FromFormat = getattr(pythonapi, name) _PyUnicode_FromFormat.restype = py_object |