7 files changed, 78 insertions, 34 deletions
diff --git a/Lib/test/json_tests/test_scanstring.py b/Lib/test/json_tests/test_scanstring.py
index f82cdee..426c8dd 100644
--- a/Lib/test/json_tests/test_scanstring.py
+++ b/Lib/test/json_tests/test_scanstring.py
@@ -9,14 +9,9 @@ class TestScanstring:
             scanstring('"z\\ud834\\udd20x"', 1, True),
             ('z\U0001d120x', 16))
 
-        if sys.maxunicode == 65535:
-            self.assertEqual(
-                scanstring('"z\U0001d120x"', 1, True),
-                ('z\U0001d120x', 6))
-        else:
-            self.assertEqual(
-                scanstring('"z\U0001d120x"', 1, True),
-                ('z\U0001d120x', 5))
+        self.assertEqual(
+            scanstring('"z\U0001d120x"', 1, True),
+            ('z\U0001d120x', 5))
 
         self.assertEqual(
             scanstring('"\\u007b"', 1, True),
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index c5b1e25..317ae44 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1,5 +1,6 @@
 import test.support, unittest
 import sys, codecs, html.entities, unicodedata
+import ctypes
 
 class PosReturn:
     # this can be used for configurable callbacks
@@ -577,8 +578,10 @@ class CodecCallbackTest(unittest.TestCase):
                 UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
             ("\\uffff", 1)
         )
-        # 1 on UCS-4 builds, 2 on UCS-2
-        len_wide = len("\U00010000")
+        if ctypes.sizeof(ctypes.c_wchar) == 2:
+            len_wide = 2
+        else:
+            len_wide = 1
         self.assertEqual(
             codecs.backslashreplace_errors(
                 UnicodeEncodeError("ascii", "\U00010000",
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 7a9e38c..17038cb 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -622,6 +622,10 @@ class UTF8Test(ReadTest):
                          b"abc\xed\xa0\x80def")
         self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                          "abc\ud800def")
+        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
+                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
+        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
+                         "\U00010fff\uD800")
         self.assertTrue(codecs.lookup_error("surrogatepass"))
 
 class UTF7Test(ReadTest):
diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py
index e0e3f63..1e782cf 100644
--- a/Lib/test/test_peepholer.py
+++ b/Lib/test/test_peepholer.py
@@ -218,10 +218,6 @@ class TestTranforms(unittest.TestCase):
         # out of range
         asm = dis_single('"fuu"[10]')
         self.assertIn('BINARY_SUBSCR', asm)
-        # non-BMP char (see #5057)
-        asm = dis_single('"\U00012345"[0]')
-        self.assertIn('BINARY_SUBSCR', asm)
-
 
     def test_folding_of_unaryops_on_constants(self):
         for line, elem in (
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index e3d10f7..d23c49b 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -780,6 +780,13 @@ class ReTests(unittest.TestCase):
         self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
         self.assertRaises(TypeError, _sre.compile, {}, 0, [])
 
+    def test_search_dot_unicode(self):
+        self.assertIsNotNone(re.search("123.*-", '123abc-'))
+        self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
+        self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
+        self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
+        self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
+
 def run_re_tests():
     from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
     if verbose:
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 4355ef5..ed96d76 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -833,13 +833,39 @@ class SizeofTest(unittest.TestCase):
         class newstyleclass(object): pass
         check(newstyleclass, s)
         # unicode
-        usize = len('\0'.encode('unicode-internal'))
-        samples = ['', '1'*100]
-        # we need to test for both sizes, because we don't know if the string
-        # has been cached
+        # each sample is a string; its expected character size is derived
+        # from its largest code point. Don't put any static strings here,
+        # as they may contain wchar_t or UTF-8 representations
+        samples = ['1'*100, '\xff'*50,
+                   '\u0100'*40, '\uffff'*100,
+                   '\U00010000'*30, '\U0010ffff'*100]
+        asciifields = h + "PPiP"
+        compactfields = asciifields + "PPP"
+        unicodefields = compactfields + "P"
         for s in samples:
-            basicsize =  size(h + 'PPPiP') + usize * (len(s) + 1)
-            check(s, basicsize)
+            maxchar = ord(max(s))
+            if maxchar < 128:
+                L = size(asciifields) + len(s) + 1
+            elif maxchar < 256:
+                L = size(compactfields) + len(s) + 1
+            elif maxchar < 65536:
+                L = size(compactfields) + 2*(len(s) + 1)
+            else:
+                L = size(compactfields) + 4*(len(s) + 1)
+            check(s, L)
+        # verify that the UTF-8 size is accounted for
+        s = chr(0x4000)   # 4 bytes canonical representation
+        check(s, size(compactfields) + 4)
+        try:
+            # FIXME: codecs.lookup(str) calls encoding.search_function() which
+            # calls __import__ using str in the module name. __import__ encodes
+            # the module name to the file system encoding (which is the locale
+            # encoding), so test_sys fails if the locale encoding is not UTF-8.
+            codecs.lookup(s) # produces 4 bytes UTF-8
+        except LookupError:
+            check(s, size(compactfields) + 4 + 4)
+        # TODO: add check that forces the presence of wchar_t representation
+        # TODO: add check that forces layout of unicodefields
         # weakref
         import weakref
         check(weakref.ref(int), size(h + '2Pl2P'))
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index f7424c0..f256ba6 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1583,16 +1583,32 @@ class UnicodeTest(string_tests.CommonTest,
         self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
 
     def test_raiseMemError(self):
-        # Ensure that the freelist contains a consistent object, even
-        # when a string allocation fails with a MemoryError.
-        # This used to crash the interpreter,
-        # or leak references when the number was smaller.
-        charwidth = 4 if sys.maxunicode >= 0x10000 else 2
-        # Note: sys.maxsize is half of the actual max allocation because of
-        # the signedness of Py_ssize_t.
-        alloc = lambda: "a" * (sys.maxsize // charwidth * 2)
-        self.assertRaises(MemoryError, alloc)
-        self.assertRaises(MemoryError, alloc)
+        if struct.calcsize('P') == 8:
+            # 64-bit pointers
+            ascii_struct_size = 64
+            compact_struct_size = 88
+        else:
+            # 32-bit pointers
+            ascii_struct_size = 32
+            compact_struct_size = 44
+
+        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
+            code = ord(char)
+            if code < 0x100:
+                char_size = 1  # sizeof(Py_UCS1)
+                struct_size = ascii_struct_size
+            elif code < 0x10000:
+                char_size = 2  # sizeof(Py_UCS2)
+                struct_size = compact_struct_size
+            else:
+                char_size = 4  # sizeof(Py_UCS4)
+                struct_size = compact_struct_size
+            # Note: sys.maxsize is half of the actual max allocation because of
+            # the signedness of Py_ssize_t; the -1 is for the null character.
+            maxlen = ((sys.maxsize - struct_size) // char_size) - 1
+            alloc = lambda: char * maxlen
+            self.assertRaises(MemoryError, alloc)
+            self.assertRaises(MemoryError, alloc)
 
     def test_format_subclass(self):
         class S(str):
@@ -1608,10 +1624,7 @@ class UnicodeTest(string_tests.CommonTest,
         from ctypes import (pythonapi, py_object,
             c_int, c_long, c_longlong, c_ssize_t,
             c_uint, c_ulong, c_ulonglong, c_size_t)
-        if sys.maxunicode == 65535:
-            name = "PyUnicodeUCS2_FromFormat"
-        else:
-            name = "PyUnicodeUCS4_FromFormat"
+        name = "PyUnicode_FromFormat"
         _PyUnicode_FromFormat = getattr(pythonapi, name)
         _PyUnicode_FromFormat.restype = py_object