diff options
author | Victor Stinner <victor.stinner@haypocalc.com> | 2011-12-08 21:14:11 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@haypocalc.com> | 2011-12-08 21:14:11 (GMT) |
commit | 8faf8216e42d78d53e92e3f3035384ecfa809d12 (patch) | |
tree | 8ae3f0e2a3ffaf073684893eacd6a04651357278 /Objects/unicodeobject.c | |
parent | bc9f0c68f5fc2376817206354833af101985e3e2 (diff) | |
download | cpython-8faf8216e42d78d53e92e3f3035384ecfa809d12.zip cpython-8faf8216e42d78d53e92e3f3035384ecfa809d12.tar.gz cpython-8faf8216e42d78d53e92e3f3035384ecfa809d12.tar.bz2 |
PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a
character is not in range [U+0000; U+10ffff].
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 67 |
1 files changed, 34 insertions, 33 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 348a83b..9670ae8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. extern "C" { #endif +/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ +#define MAX_UNICODE 0x10ffff + #ifdef Py_DEBUG # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) #else @@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) } else { assert(maxchar >= 0x10000); - /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm() - return characters outside the range U+0000-U+10FFFF. */ - /* assert(maxchar <= 0x10FFFF); */ + assert(maxchar <= MAX_UNICODE); } } return 1; @@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) { const wchar_t *iter; + Py_UCS4 ch; assert(num_surrogates != NULL && maxchar != NULL); *num_surrogates = 0; *maxchar = 0; for (iter = begin; iter < end; ) { - if (*iter > *maxchar) { - *maxchar = *iter; -#if SIZEOF_WCHAR_T != 2 - if (*maxchar >= 0x10000) - return 0; -#endif - } #if SIZEOF_WCHAR_T == 2 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) && (iter+1) < end && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) { - Py_UCS4 surrogate_val; - surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); + ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); ++(*num_surrogates); - if (surrogate_val > *maxchar) - *maxchar = surrogate_val; iter += 2; } else - iter++; -#else - iter++; #endif + { + ch = *iter; + iter++; + } + if (ch > *maxchar) { + *maxchar = ch; + if (*maxchar > MAX_UNICODE) { + PyErr_Format(PyExc_ValueError, + "character U+%x is not in range [U+0000; U+10ffff]", + ch); + return -1; + } + } } return 0; } @@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) &maxchar, &num_surrogates) == -1) return NULL; - unicode = PyUnicode_New(size - num_surrogates, - 
maxchar); + unicode = PyUnicode_New(size - num_surrogates, maxchar); if (!unicode) return NULL; @@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind) return 0x10000; default: assert(0 && "invalid kind"); - return 0x10ffff; + return MAX_UNICODE; } } @@ -2796,7 +2797,7 @@ PyObject * PyUnicode_FromOrdinal(int ordinal) { PyObject *v; - if (ordinal < 0 || ordinal > 0x10ffff) { + if (ordinal < 0 || ordinal > MAX_UNICODE) { PyErr_SetString(PyExc_ValueError, "chr() arg not in range(0x110000)"); return NULL; @@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) four_bytes = PyUnicode_4BYTE_DATA(unicode); for (; four_bytes < ucs4_end; ++four_bytes, ++w) { if (*four_bytes > 0xFFFF) { - assert(*four_bytes <= 0x10FFFF); + assert(*four_bytes <= MAX_UNICODE); /* encode surrogate pair in this case */ *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); @@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str, continue; encode_char: if (ch >= 0x10000) { - assert(ch <= 0x10FFFF); + assert(ch <= MAX_UNICODE); /* code first surrogate */ base64bits += 16; @@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); - assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); + assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); WRITE_MAYBE_FAIL(i++, ch); break; @@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); - assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); + assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); #if SIZEOF_WCHAR_T == 4 *p++ = (wchar_t)ch; @@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); } else /* ch >= 0x10000 */ { - assert(ch <= 0x10FFFF); + assert(ch <= MAX_UNICODE); /* Encode UCS4 Unicode 
ordinals */ *p++ = (char)(0xf0 | (ch >> 18)); *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); @@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s, break; store: /* when we get here, chr is a 32-bit unicode character */ - if (chr <= 0x10ffff) { + if (chr <= MAX_UNICODE) { WRITECHAR(chr); } else { endinpos = s-starts; @@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 21-bit characters to '\U00xxxxxx' */ else if (ch >= 0x10000) { - assert(ch <= 0x10FFFF); + assert(ch <= MAX_UNICODE); *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; @@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, else x += 10 + c - 'A'; } - if (x <= 0x10ffff) { + if (x <= MAX_UNICODE) { if (unicode_putchar(&v, &outpos, x) < 0) goto onError; } else { @@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* Map 32-bit characters to '\Uxxxxxxxx' */ if (ch >= 0x10000) { - assert(ch <= 0x10FFFF); + assert(ch <= MAX_UNICODE); *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0xf]; @@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode, else if (ch < 1000000) repsize += 2+6+1; else { - assert(ch <= 0x10FFFF); + assert(ch <= MAX_UNICODE); repsize += 2+7+1; } } @@ -9275,7 +9276,7 @@ fixup(PyObject *self, else if (maxchar_new <= 65535) maxchar_new = 65535; else - maxchar_new = 1114111; /* 0x10ffff */ + maxchar_new = MAX_UNICODE; if (!maxchar_new && PyUnicode_CheckExact(self)) { /* fixfct should return TRUE if it modified the buffer. If @@ -13059,7 +13060,7 @@ formatchar(PyObject *v) if (x == -1 && PyErr_Occurred()) goto onError; - if (x < 0 || x > 0x10ffff) { + if (x < 0 || x > MAX_UNICODE) { PyErr_SetString(PyExc_OverflowError, "%c arg not in range(0x110000)"); return (Py_UCS4) -1; |