gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)

author: Serhiy Storchaka <storchaka@gmail.com> 2023-10-11 13:41:58 (GMT)
committer: GitHub <noreply@github.com> 2023-10-11 13:41:58 (GMT)
commit: eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (patch)
tree: 410d6c0c88cb2c223cace999b490a6adce642e8f /Objects/unicodeobject.c
parent: d1f7fae424d51b0374c8204599583c4a26c1a992 (diff)
download: cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.zip
cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.gz
cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.bz2
1 files changed, 76 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 49981a1..33cbc98 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -10674,6 +10674,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
 }
 
 int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)  /* Variant of PyUnicode_EqualToUTF8AndSize() for a NUL-terminated str. */
+{
+    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));  /* strlen() reads str, so str must be non-NULL and NUL-terminated; an embedded NUL ends the comparison range early. */
+}
+
+int  /* Returns 1 if the text of unicode, encoded as UTF-8, is exactly the size bytes at str; otherwise returns 0. */
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
+{
+    assert(_PyUnicode_CHECK(unicode));  /* Checked by assert() only; no runtime type check or error return exists in this function. */
+    assert(str);
+
+    if (PyUnicode_IS_ASCII(unicode)) {  /* Fast path: the object's 1-byte buffer is compared with str directly. */
+        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+        return size == len &&  /* Length is tested first, so memcmp() never reads more than size bytes of str. */
+            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
+    }
+    if (PyUnicode_UTF8(unicode) != NULL) {  /* Fast path: the object already exposes a UTF-8 buffer, so compare it byte-for-byte. */
+        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
+        return size == len &&
+            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
+    }
+
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);  /* From here on len counts code points, not bytes. */
+    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {  /* Cheap reject: the ASCII case returned above, so presumably some code point needs 2+ bytes and a match requires len < size <= 4*len. The size_t casts make a negative size compare as a huge value, which is rejected. */
+        return 0;
+    }
+    const unsigned char *s = (const unsigned char *)str;  /* Read bytes as unsigned so they compare correctly with values >= 0x80. */
+    const unsigned char *ends = s + (size_t)size;  /* One past the last byte of str. */
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    /* Compare Unicode string and UTF-8 string */
+    for (Py_ssize_t i = 0; i < len; i++) {  /* Encode each code point on the fly and match it against the next bytes of str; no buffer is allocated. */
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x80) {  /* 1-byte sequence: the byte equals the code point. */
+            if (ends == s || s[0] != ch) {  /* The ends check guards the read of s[0]. */
+                return 0;
+            }
+            s += 1;
+        }
+        else if (ch < 0x800) {  /* 2-byte sequence: 110xxxxx 10xxxxxx. */
+            if ((ends - s) < 2 ||  /* Bounds check comes first; || short-circuits before any s[] read. */
+                s[0] != (0xc0 | (ch >> 6)) ||
+                s[1] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 2;
+        }
+        else if (ch < 0x10000) {  /* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx. */
+            if (Py_UNICODE_IS_SURROGATE(ch) ||  /* A surrogate code point in the object never compares equal, whatever bytes str holds. */
+                (ends - s) < 3 ||
+                s[0] != (0xe0 | (ch >> 12)) ||
+                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[2] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 3;
+        }
+        else {  /* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
+            assert(ch <= MAX_UNICODE);
+            if ((ends - s) < 4 ||
+                s[0] != (0xf0 | (ch >> 18)) ||
+                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
+                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[3] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 4;
+        }
+    }
+    return s == ends;  /* Equal only if every byte of str was consumed; leftover bytes mean str is longer. */
+}
+
+int
 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
 {
     size_t len;
author	Serhiy Storchaka <storchaka@gmail.com>	2023-10-11 13:41:58 (GMT)
committer	GitHub <noreply@github.com>	2023-10-11 13:41:58 (GMT)
commit	eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (patch)
tree	410d6c0c88cb2c223cace999b490a6adce642e8f /Objects/unicodeobject.c
parent	d1f7fae424d51b0374c8204599583c4a26c1a992 (diff)
download	cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.zip cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.gz cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.bz2