summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:30:23 (GMT)
committer: Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:30:23 (GMT)
commit: e4a189274f3d88d64d5238bf340cec96eff4e5e0 (patch)
tree: 5ead5f4f2fe3799a34155f2e41a04518adb995b1 /Lib
parent: ea99c5c94985c21d8a64c9a3d753bde7f801c14a (diff)
download: cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.zip
          cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.tar.gz
          cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.tar.bz2
Issue #9804: ascii() now always represents unicode surrogate pairs as
a single `\UXXXXXXXX`, regardless of whether the character is printable or not. Also, the "backslashreplace" error handler now joins surrogate pairs into a single character on UCS-2 builds.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/test/test_builtin.py 22
-rw-r--r-- Lib/test/test_codeccallbacks.py 36
2 files changed, 47 insertions, 11 deletions
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py
index 4e09ca5..35b652b 100644
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase):
a = {}
a[0] = a
self.assertEqual(ascii(a), '{0: {...}}')
+ # Advanced checks for unicode strings
+ def _check_uni(s):
+ self.assertEqual(ascii(s), repr(s))
+ _check_uni("'")
+ _check_uni('"')
+ _check_uni('"\'')
+ _check_uni('\0')
+ _check_uni('\r\n\t .')
+ # Unprintable non-ASCII characters
+ _check_uni('\x85')
+ _check_uni('\u1fff')
+ _check_uni('\U00012fff')
+ # Lone surrogates
+ _check_uni('\ud800')
+ _check_uni('\udfff')
+ # Issue #9804: surrogates should be joined even for printable
+ # wide characters (UCS-2 builds).
+ self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
+ # All together
+ s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
+ self.assertEqual(ascii(s),
+ r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
def test_neg(self):
x = -sys.maxsize-1
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 82782b5..6105fc0 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase):
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1)
)
- if sys.maxunicode>0xffff:
- self.assertEquals(
- codecs.backslashreplace_errors(
- UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
- ("\\U00010000", 1)
- )
- self.assertEquals(
- codecs.backslashreplace_errors(
- UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
- ("\\U0010ffff", 1)
- )
+ # 1 on UCS-4 builds, 2 on UCS-2
+ len_wide = len("\U00010000")
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\U00010000",
+ 0, len_wide, "ouch")),
+ ("\\U00010000", len_wide)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\U0010ffff",
+ 0, len_wide, "ouch")),
+ ("\\U0010ffff", len_wide)
+ )
+ # Lone surrogates (regardless of unicode width)
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
+ ("\\ud800", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
+ ("\\udfff", 1)
+ )
def test_badhandlerresults(self):
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )