From d6d06ade263d4a2e3d865b1a9f62e6d64eb73604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?=
Date: Fri, 7 Jul 2000 17:48:52 +0000
Subject: Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

---
Reviewer notes (free-form text placed after the three-dash separator; it is
not part of the commit message or of the diff):

  * The patched file uses Python 2 syntax: print statements, u'' literals
    and the unicode() builtin.
  * Hunk 1 adds a "Testing UTF-16 code point order comparisons" section.
      - test_lecmp(s, s2) asserts s < s2 and reports both operands on
        failure.
      - test_fixup(s) calls test_lecmp(s, pair) for 16 surrogate pairs:
        lead units D800/D900/DA00/DB00, each combined with trail units
        DC01, DD01, DE01 and DFFF.
      - test_fixup is run for u'\ue000' and u'\uff61', i.e. for code units
        numerically above the surrogate range that must still compare
        below every surrogate pair.
  * Hunk 2 adds UTF-8 codec assertions, each checked in both directions
    (encode and decode):
      - u'\u20ac'       <-> E2 82 AC
      - u'\ud800\udc02' <-> F0 90 80 82
      - u'\ud84d\udc56' <-> F0 A3 91 96
    Strict-decoding error cases are left as a comment for future work.
  * NOTE(review): this copy of the patch had its whitespace collapsed.
    Line breaks, blank lines and indentation below were reconstructed from
    the hunk line counts. The padded string literals in the ljust / rjust /
    center context lines presumably contained longer runs of spaces --
    verify against the upstream file before applying.

 Lib/test/test_unicode.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index ef8bd82..76a2591 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -168,6 +168,57 @@ assert 'abc' < u'abcd'
 assert u'abc' < u'abcd'
 print 'done.'
 
+print 'Testing UTF-16 code point order comparisons...',
+#No surrogates, no fixup required.
+assert u'\u0061' < u'\u20ac'
+# Non surrogate below surrogate value, no fixup required
+assert u'\u0061' < u'\ud800\udc02'
+
+# Non surrogate above surrogate value, fixup required
+def test_lecmp(s, s2):
+    assert s < s2 , "comparison failed on %s < %s" % (s, s2)
+
+def test_fixup(s):
+    s2 = u'\ud800\udc01'
+    test_lecmp(s, s2)
+    s2 = u'\ud900\udc01'
+    test_lecmp(s, s2)
+    s2 = u'\uda00\udc01'
+    test_lecmp(s, s2)
+    s2 = u'\udb00\udc01'
+    test_lecmp(s, s2)
+    s2 = u'\ud800\udd01'
+    test_lecmp(s, s2)
+    s2 = u'\ud900\udd01'
+    test_lecmp(s, s2)
+    s2 = u'\uda00\udd01'
+    test_lecmp(s, s2)
+    s2 = u'\udb00\udd01'
+    test_lecmp(s, s2)
+    s2 = u'\ud800\ude01'
+    test_lecmp(s, s2)
+    s2 = u'\ud900\ude01'
+    test_lecmp(s, s2)
+    s2 = u'\uda00\ude01'
+    test_lecmp(s, s2)
+    s2 = u'\udb00\ude01'
+    test_lecmp(s, s2)
+    s2 = u'\ud800\udfff'
+    test_lecmp(s, s2)
+    s2 = u'\ud900\udfff'
+    test_lecmp(s, s2)
+    s2 = u'\uda00\udfff'
+    test_lecmp(s, s2)
+    s2 = u'\udb00\udfff'
+    test_lecmp(s, s2)
+
+test_fixup(u'\ue000')
+test_fixup(u'\uff61')
+
+# Surrogates on both sides, no fixup required
+assert u'\ud800\udc02' < u'\ud84d\udc56'
+print 'done.'
+
 test('ljust', u'abc', u'abc ', 10)
 test('rjust', u'abc', u' abc', 10)
 test('center', u'abc', u' abc ', 10)
@@ -293,6 +344,27 @@ print 'done.'
 
 # Test builtin codecs
 print 'Testing builtin codecs...',
+# UTF-8 specific encoding tests:
+assert u'\u20ac'.encode('utf-8') == \
+       ''.join((chr(0xe2), chr(0x82), chr(0xac)))
+assert u'\ud800\udc02'.encode('utf-8') == \
+       ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82)))
+assert u'\ud84d\udc56'.encode('utf-8') == \
+       ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96)))
+# UTF-8 specific decoding tests
+assert unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
+               'utf-8') == u'\ud84d\udc56'
+assert unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
+               'utf-8') == u'\ud800\udc02'
+assert unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
+               'utf-8') == u'\u20ac'
+
+# Other possible utf-8 test cases:
+# * strict decoding testing for all of the
+#   UTF8_ERROR cases in PyUnicode_DecodeUTF8
+
+
+
 assert unicode('hello','ascii') == u'hello'
 assert unicode('hello','utf-8') == u'hello'
 assert unicode('hello','utf8') == u'hello'
-- 
cgit v0.12