diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-07-20 17:39:11 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-07-20 17:39:11 (GMT) |
commit | 6c6bfb7c70d77b52354a6fd8c76de2cc641aa8fe (patch) | |
tree | 78df5b177cd555fcaef467117c6d04d1de021215 | |
parent | 0d42e0c54a3b95aec4d4d12d1cd758438d645089 (diff) | |
download | cpython-6c6bfb7c70d77b52354a6fd8c76de2cc641aa8fe.zip cpython-6c6bfb7c70d77b52354a6fd8c76de2cc641aa8fe.tar.gz cpython-6c6bfb7c70d77b52354a6fd8c76de2cc641aa8fe.tar.bz2 |
Make the unicode-escape and UTF-16 codecs handle surrogates correctly,
so that both are roundtrip-safe.
Some minor cleanups of the code.
Added tests for roundtrip safety.
-rw-r--r-- | Lib/test/test_unicode.py | 8 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 70 |
2 files changed, 54 insertions, 24 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index c9732d6..eb74854 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -445,11 +445,19 @@ verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000') verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o') verify(u'hello'.encode('latin-1') == 'hello') +# Roundtrip safety for BMP (just the first 1024 chars) u = u''.join(map(unichr, range(1024))) for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): verify(unicode(u.encode(encoding),encoding) == u) +# Roundtrip safety for non-BMP (just a few chars) +u = u'\U00010001\U00020002\U00030003\U00040004\U00050005' +for encoding in ('utf-8', + 'utf-16', 'utf-16-le', 'utf-16-be', + 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): + verify(unicode(u.encode(encoding),encoding) == u) + u = u''.join(map(unichr, range(256))) for encoding in ( 'latin-1', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a46df16..172c61c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -104,7 +104,7 @@ static PyUnicodeObject *unicode_latin1[256]; static char unicode_default_encoding[100]; Py_UNICODE -PyUnicode_GetMax() +PyUnicode_GetMax(void) { #ifdef Py_UNICODE_WIDE return 0x10FFFF; @@ -1081,17 +1081,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, #endif if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { #ifndef Py_UNICODE_WIDE - /* This is valid data (a UTF-16 surrogate pair), but - we are not able to store this information since our - Py_UNICODE type only has 16 bits... this might - change someday, even though it's unlikely. 
*/ - errmsg = "code pairs are not supported"; - goto utf16Error; + *p++ = ch; + *p++ = ch2; #else *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; - continue; #endif - + continue; } else { errmsg = "illegal UTF-16 surrogate"; @@ -1325,7 +1320,8 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, /* UCS-2 character */ *p++ = (Py_UNICODE) chr; else if (chr <= 0x10ffff) { - /* UCS-4 character. Either store directly, or as surrogate pair. */ + /* UCS-4 character. Either store directly, or as + surrogate pair. */ #ifdef Py_UNICODE_WIDE *p++ = chr; #else @@ -1446,24 +1442,50 @@ PyObject *unicodeescape_string(const Py_UNICODE *s, else if (ch >= 0x10000) { *p++ = '\\'; *p++ = 'U'; - *p++ = hexdigit[(ch >> 28) & 0xf]; - *p++ = hexdigit[(ch >> 24) & 0xf]; - *p++ = hexdigit[(ch >> 20) & 0xf]; - *p++ = hexdigit[(ch >> 16) & 0xf]; - *p++ = hexdigit[(ch >> 12) & 0xf]; - *p++ = hexdigit[(ch >> 8) & 0xf]; - *p++ = hexdigit[(ch >> 4) & 0xf]; + *p++ = hexdigit[(ch >> 28) & 0x0000000F]; + *p++ = hexdigit[(ch >> 24) & 0x0000000F]; + *p++ = hexdigit[(ch >> 20) & 0x0000000F]; + *p++ = hexdigit[(ch >> 16) & 0x0000000F]; + *p++ = hexdigit[(ch >> 12) & 0x0000000F]; + *p++ = hexdigit[(ch >> 8) & 0x0000000F]; + *p++ = hexdigit[(ch >> 4) & 0x0000000F]; *p++ = hexdigit[ch & 15]; } #endif + /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ + else if (ch >= 0xD800 && ch < 0xDC00) { + Py_UNICODE ch2; + Py_UCS4 ucs; + + ch2 = *s++; + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; + *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; + *p++ = hexdigit[ucs & 0x0000000F]; + continue; + } + /* Fall through: isolated surrogates are 
copied as-is */ + s--; + size++; + } + /* Map 16-bit characters to '\uxxxx' */ - else if (ch >= 256) { + if (ch >= 256) { *p++ = '\\'; *p++ = 'u'; - *p++ = hexdigit[(ch >> 12) & 0xf]; - *p++ = hexdigit[(ch >> 8) & 0xf]; - *p++ = hexdigit[(ch >> 4) & 0xf]; - *p++ = hexdigit[ch & 15]; + *p++ = hexdigit[(ch >> 12) & 0x000F]; + *p++ = hexdigit[(ch >> 8) & 0x000F]; + *p++ = hexdigit[(ch >> 4) & 0x000F]; + *p++ = hexdigit[ch & 0x000F]; } /* Map special whitespace to '\t', \n', '\r' */ else if (ch == '\t') { @@ -1482,8 +1504,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s, else if (ch < ' ' || ch >= 128) { *p++ = '\\'; *p++ = 'x'; - *p++ = hexdigit[(ch >> 4) & 0xf]; - *p++ = hexdigit[ch & 15]; + *p++ = hexdigit[(ch >> 4) & 0x000F]; + *p++ = hexdigit[ch & 0x000F]; } /* Copy everything else as-is */ else |