#9200: The str.is* methods now work with strings that contain non-BMP characters even in narrow Unicode builds.

author: Ezio Melotti <ezio.melotti@gmail.com> 2011-08-22 11:08:38 (GMT)
committer: Ezio Melotti <ezio.melotti@gmail.com> 2011-08-22 11:08:38 (GMT)
commit: 93e7afc5d9b248e264da5dbe97659bf37cb8eb15 (patch)
tree: 3382fc6a75dd52900aa747c5308009903616850f /Objects
parent: 3d3bc8121915968ee8e445299800914b5a46da3a (diff)
download: cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.zip
cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.gz
cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.bz2
1 files changed, 60 insertions, 41 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 77f8dd5..8567a9f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6442,6 +6442,37 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
             start = 0;                          \
     }
 
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ *       (especially if used with other macros).
+ */
+
+/* helper macros used by _Py_UNICODE_NEXT */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
+      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
+#else
+#define _Py_UNICODE_NEXT(ptr, end)                                      \
+     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
+        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
+       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
+       (Py_UCS4)*(ptr)++)
+#endif
+
 Py_ssize_t PyUnicode_Count(PyObject *str,
                            PyObject *substr,
                            Py_ssize_t start,
@@ -7705,8 +7736,8 @@ unicode_islower(PyUnicodeObject *self)
 
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
             return PyBool_FromLong(0);
@@ -7739,8 +7770,8 @@ unicode_isupper(PyUnicodeObject *self)
 
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
             return PyBool_FromLong(0);
@@ -7777,8 +7808,8 @@ unicode_istitle(PyUnicodeObject *self)
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
     previous_is_cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
             if (previous_is_cased)
@@ -7820,8 +7851,9 @@ unicode_isspace(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISSPACE(*p))
+    while (p < e) {
+        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
+        if (!Py_UNICODE_ISSPACE(ch))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7849,8 +7881,8 @@ unicode_isalpha(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISALPHA(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7878,8 +7910,9 @@ unicode_isalnum(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISALNUM(*p))
+    while (p < e) {
+        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
+        if (!Py_UNICODE_ISALNUM(ch))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7907,8 +7940,8 @@ unicode_isdecimal(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISDECIMAL(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7936,8 +7969,8 @@ unicode_isdigit(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISDIGIT(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7965,37 +7998,22 @@ unicode_isnumeric(PyUnicodeObject *self)
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISNUMERIC(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
 }
 
-static Py_UCS4
-decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
-{
-    Py_UCS4 ch;
-    assert(*i < size);
-    ch = s[(*i)++];
-#ifndef Py_UNICODE_WIDE
-    if ((ch & 0xfffffc00) == 0xd800 &&
-        *i < size
-        && (s[*i] & 0xFFFFFC00) == 0xDC00)
-        ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
-#endif
-    return ch;
-}
-
 int
 PyUnicode_IsIdentifier(PyObject *self)
 {
-    Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
-    Py_UCS4 first;
     const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
+    const Py_UNICODE *e;
+    Py_UCS4 first;
 
     /* Special case for empty strings */
-    if (!size)
+    if (PyUnicode_GET_SIZE(self) == 0)
         return 0;
 
     /* PEP 3131 says that the first character must be in
@@ -8006,12 +8024,13 @@ PyUnicode_IsIdentifier(PyObject *self)
        definition of XID_Start and XID_Continue, it is sufficient
        to check just for these, except that _ must be allowed
        as starting an identifier.  */
-    first = decode_ucs4(p, &i, size);
+    e = p + PyUnicode_GET_SIZE(self);
+    first = _Py_UNICODE_NEXT(p, e);
     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
         return 0;
 
-    while (i < size)
-        if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size)))
+    while (p < e)
+        if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
             return 0;
     return 1;
 }
@@ -8046,8 +8065,8 @@ unicode_isprintable(PyObject *self)
     }
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISPRINTABLE(*p)) {
+    while (p < e) {
+        if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
             Py_RETURN_FALSE;
         }
     }
author	Ezio Melotti <ezio.melotti@gmail.com>	2011-08-22 11:08:38 (GMT)
committer	Ezio Melotti <ezio.melotti@gmail.com>	2011-08-22 11:08:38 (GMT)
commit	93e7afc5d9b248e264da5dbe97659bf37cb8eb15 (patch)
tree	3382fc6a75dd52900aa747c5308009903616850f /Objects
parent	3d3bc8121915968ee8e445299800914b5a46da3a (diff)
download	cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.zip cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.gz cpython-93e7afc5d9b248e264da5dbe97659bf37cb8eb15.tar.bz2