diff options
-rw-r--r-- | Lib/test/output/test_unicodedata | 2 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 56 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 23 | ||||
-rw-r--r-- | Python/import.c | 21 |
4 files changed, 71 insertions, 31 deletions
diff --git a/Lib/test/output/test_unicodedata b/Lib/test/output/test_unicodedata index 3bd5575..c2c3d7c 100644 --- a/Lib/test/output/test_unicodedata +++ b/Lib/test/output/test_unicodedata @@ -1,5 +1,5 @@ test_unicodedata Testing Unicode Database... -Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18 +Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a API: ok diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 5368f6e..56f1811 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -23,21 +23,23 @@ if not sys.platform.startswith('java'): verify(repr(u"'\"") == """u'\\'"'""") verify(repr(u"'") == '''u"'"''') verify(repr(u'"') == """u'"'""") - verify(repr(u''.join(map(unichr, range(256)))) == - "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" - "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" - "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" - "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" - "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" - "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" - "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" - "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" - "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" - "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" - "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" - "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" - "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" - "\\xfe\\xff'") + latin1repr = ( + "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" + "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" + "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" + 
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" + "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" + "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" + "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" + "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" + "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" + "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" + "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" + "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" + "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" + "\\xfe\\xff'") + testrepr = repr(u''.join(map(unichr, range(256)))) + verify(testrepr == latin1repr) def test(method, input, output, *args): if verbose: @@ -495,6 +497,7 @@ else: verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd') # UTF-8 specific encoding tests: +verify(u''.encode('utf-8') == '') verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac') verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82') verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96') @@ -552,14 +555,7 @@ for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): verify(unicode(u.encode(encoding),encoding) == u) -# Roundtrip safety for non-BMP (just a few chars) -u = u'\U00010001\U00020002\U00030003\U00040004\U00050005' -for encoding in ('utf-8', - 'utf-16', 'utf-16-le', 'utf-16-be', - #'raw_unicode_escape', - 'unicode_escape', 'unicode_internal'): - verify(unicode(u.encode(encoding),encoding) == u) - +# Roundtrip safety for BMP (just the first 256 chars) u = u''.join(map(unichr, range(256))) for encoding in ( 'latin-1', @@ -571,6 +567,7 @@ for encoding in ( except ValueError,why: print '*** codec for "%s" failed: %s' % (encoding, why) +# Roundtrip safety for BMP (just the 
first 128 chars) u = u''.join(map(unichr, range(128))) for encoding in ( 'ascii', @@ -582,6 +579,19 @@ for encoding in ( except ValueError,why: print '*** codec for "%s" failed: %s' % (encoding, why) +# Roundtrip safety for non-BMP (just a few chars) +u = u'\U00010001\U00020002\U00030003\U00040004\U00050005' +for encoding in ('utf-8', + 'utf-16', 'utf-16-le', 'utf-16-be', + #'raw_unicode_escape', + 'unicode_escape', 'unicode_internal'): + verify(unicode(u.encode(encoding),encoding) == u) + +# UTF-8 must be roundtrip safe for all UCS-2 code points +u = u''.join(map(unichr, range(0x10000))) +for encoding in ('utf-8',): + verify(unicode(u.encode(encoding),encoding) == u) + print 'done.' print 'Testing standard mapping codecs...', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fb9b81f..1d0508c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); - if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { + if (ch < 0x0800) { + /* Note: UTF-8 encodings of surrogates are considered + legal UTF-8 sequences; + + XXX For wide builds (UCS-4) we should probably try + to recombine the surrogates into a single code + unit. 
+ */ errmsg = "illegal encoding"; goto utf8Error; } else - *p++ = (Py_UNICODE)ch; + *p++ = (Py_UNICODE)ch; break; case 4: @@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); /* validate and convert to UTF-16 */ if ((ch < 0x10000) /* minimum value allowed for 4 - byte encoding */ + byte encoding */ || (ch > 0x10ffff)) /* maximum value allowed for - UTF-16 */ + UTF-16 */ { errmsg = "illegal encoding"; goto utf8Error; @@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, unsigned int cbWritten = 0; int i = 0; + /* Short-cut for empty strings */ + if (size == 0) + return PyString_FromStringAndSize(NULL, 0); + + /* We allocate 4 more bytes to have room for at least one full + UTF-8 sequence; saves a few cycles in the loop below */ v = PyString_FromStringAndSize(NULL, cbAllocated + 4); if (v == NULL) return NULL; - if (size == 0) - return v; p = PyString_AS_STRING(v); while (i < size) { diff --git a/Python/import.c b/Python/import.c index ba7d5d5..e657b70 100644 --- a/Python/import.c +++ b/Python/import.c @@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *); the Unicode -U option is in use. IMO (Tim's), that's a Bad Idea (quite apart from that the -U option doesn't work so isn't used anyway). + + XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the + UTF-8 encoder (it previously produced invalid UTF-8 for unpaired + high surrogates), so I simply bumped the month value to 20 (invalid + month) and set the day to 1. This should be recognizable by any + algorithm relying on the above scheme. Perhaps we should simply + start counting in increments of 10 from now on ?! 
+ + Known values: + Python 1.5: 20121 + Python 1.5.1: 20121 + Python 1.5.2: 20121 + Python 2.0: 50823 + Python 2.0.1: 50823 + Python 2.1: 60202 + Python 2.1.1: 60202 + Python 2.1.2: 60202 + Python 2.2: 60717 + Python 2.3a0: 62001 */ -#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24)) +#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24)) /* Magic word as global; note that _PyImport_Init() can change the value of this global to accommodate for alterations of how the |