diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-08-22 11:08:38 (GMT) |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-08-22 11:08:38 (GMT) |
commit | 93e7afc5d9b248e264da5dbe97659bf37cb8eb15 (patch) | |
tree | 3382fc6a75dd52900aa747c5308009903616850f /Objects | |
parent | 3d3bc8121915968ee8e445299800914b5a46da3a (diff) | |
download | cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.zip cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.gz cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.bz2 |
#9200: The str.is* methods now work with strings that contain non-BMP characters even in narrow Unicode builds.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 101 |
1 files changed, 60 insertions, 41 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 77f8dd5..8567a9f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6442,6 +6442,37 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, start = 0; \ } +/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed + * by 'ptr', possibly combining surrogate pairs on narrow builds. + * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character + * that should be returned and 'end' pointing to the end of the buffer. + * ('end' is used on narrow builds to detect a lone surrogate at the + * end of the buffer that should be returned unchanged.) + * The ptr and end arguments should be side-effect free and ptr must an lvalue. + * The type of the returned char is always Py_UCS4. + * + * Note: the macro advances ptr to next char, so it might have side-effects + * (especially if used with other macros). + */ + +/* helper macros used by _Py_UNICODE_NEXT */ +#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) +#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. */ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + (((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000) + +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ + _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \ + ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) +#endif + Py_ssize_t PyUnicode_Count(PyObject *str, PyObject *substr, Py_ssize_t start, @@ -7705,8 +7736,8 @@ unicode_islower(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7739,8 +7770,8 @@ unicode_isupper(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7777,8 +7808,8 @@ unicode_istitle(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; previous_is_cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { if (previous_is_cased) @@ -7820,8 +7851,9 @@ unicode_isspace(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISSPACE(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + if (!Py_UNICODE_ISSPACE(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7849,8 +7881,8 @@ unicode_isalpha(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALPHA(*p)) + while (p < e) { + if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7878,8 +7910,9 @@ unicode_isalnum(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALNUM(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + if (!Py_UNICODE_ISALNUM(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7907,8 +7940,8 @@ unicode_isdecimal(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDECIMAL(*p)) + while (p < e) { + if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7936,8 +7969,8 @@ unicode_isdigit(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDIGIT(*p)) + while (p < e) { + if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7965,37 +7998,22 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISNUMERIC(*p)) + while (p < e) { + if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); } -static Py_UCS4 -decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size) -{ - Py_UCS4 ch; - assert(*i < size); - ch = s[(*i)++]; -#ifndef Py_UNICODE_WIDE - if ((ch & 0xfffffc00) == 0xd800 && - *i < size - && (s[*i] & 0xFFFFFC00) == 0xDC00) - ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; -#endif - return ch; -} - int PyUnicode_IsIdentifier(PyObject *self) { - Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self); - Py_UCS4 first; const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); + const Py_UNICODE *e; + Py_UCS4 first; /* Special case for empty strings */ - if (!size) + if (PyUnicode_GET_SIZE(self) == 0) return 0; /* PEP 3131 says that the first character must be in @@ -8006,12 +8024,13 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. */ - first = decode_ucs4(p, &i, size); + e = p + PyUnicode_GET_SIZE(self); + first = _Py_UNICODE_NEXT(p, e); if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) return 0; - while (i < size) - if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) + while (p < e) + if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e))) return 0; return 1; } @@ -8046,8 +8065,8 @@ unicode_isprintable(PyObject *self) } e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISPRINTABLE(*p)) { + while (p < e) { + if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) { Py_RETURN_FALSE; } } |