diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2023-10-11 13:41:58 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-11 13:41:58 (GMT) |
commit | eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (patch) | |
tree | 410d6c0c88cb2c223cace999b490a6adce642e8f /Objects/unicodeobject.c | |
parent | d1f7fae424d51b0374c8204599583c4a26c1a992 (diff) | |
download | cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.zip cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.gz cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.bz2 |
gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 49981a1..33cbc98 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -10674,6 +10674,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
 }
 
 /* Compare the string object `unicode` with the NUL-terminated byte string
  * `str`, treated as UTF-8.
  *
  * Thin wrapper: the byte length is taken with strlen(), so `str` must be
  * NUL-terminated, and the result of PyUnicode_EqualToUTF8AndSize() is
  * returned unchanged.
  */
 int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
+{
+    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
+}
+/* Compare the string object `unicode` with the first `size` bytes of `str`,
+ * treated as UTF-8, without building a temporary object.
+ *
+ *   unicode  string object; must satisfy _PyUnicode_CHECK() (asserted)
+ *   str      byte buffer to compare against; must not be NULL (asserted)
+ *   size     number of bytes of `str` to compare
+ *
+ * Returns 1 if the bytes match the UTF-8 encoding of `unicode` exactly and
+ * 0 otherwise. A string containing a surrogate code point never compares
+ * equal.
+ */
+int
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
+{
+    assert(_PyUnicode_CHECK(unicode));
+    assert(str);
+    /* ASCII text is byte-for-byte its own UTF-8 encoding, so the 1-byte
+     * character data can be compared with `str` directly once the lengths
+     * agree. */
+    if (PyUnicode_IS_ASCII(unicode)) {
+        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+        return size == len &&
+            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
+    }
+    if (PyUnicode_UTF8(unicode) != NULL) {  /* presumably an already-stored UTF-8 form -- confirm */
+        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
+        return size == len &&
+            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
+    }
+    /* Quick reject before the slow path. Strings that passed the ASCII test
+     * returned above, so (presumably) at least one character here needs more
+     * than one byte and the byte count must exceed `len`; the loop below
+     * consumes at most 4 bytes per character, which bounds it from above. */
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
+        return 0;
+    }
+    const unsigned char *s = (const unsigned char *)str;
+    const unsigned char *ends = s + (size_t)size;  /* one past the last byte to compare */
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    /* Walk the code points of `unicode` and check that `s` holds the expected
+     * 1- to 4-byte sequence for each one. Every branch tests the number of
+     * remaining bytes before it reads s[...], so the short-circuit order of
+     * the || chains is what keeps the reads inside [str, ends). */
+    for (Py_ssize_t i = 0; i < len; i++) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x80) {  /* one byte: the code point itself */
+            if (ends == s || s[0] != ch) {
+                return 0;
+            }
+            s += 1;
+        }
+        else if (ch < 0x800) {  /* two bytes: 0xc0 lead + one 0x80 continuation */
+            if ((ends - s) < 2 ||
+                s[0] != (0xc0 | (ch >> 6)) ||
+                s[1] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 2;
+        }
+        else if (ch < 0x10000) {  /* three bytes: 0xe0 lead + two continuations */
+            if (Py_UNICODE_IS_SURROGATE(ch) ||  /* surrogates never compare equal */
+                (ends - s) < 3 ||
+                s[0] != (0xe0 | (ch >> 12)) ||
+                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[2] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 3;
+        }
+        else {  /* four bytes: 0xf0 lead + three continuations */
+            assert(ch <= MAX_UNICODE);
+            if ((ends - s) < 4 ||
+                s[0] != (0xf0 | (ch >> 18)) ||
+                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
+                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[3] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 4;
+        }
+    }
+    return s == ends;  /* equal only if every byte of `str` was consumed */
+}
+
+int
 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
 {
     size_t len; |