summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2023-10-11 13:41:58 (GMT)
committer GitHub <noreply@github.com> 2023-10-11 13:41:58 (GMT)
commit eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (patch)
tree 410d6c0c88cb2c223cace999b490a6adce642e8f /Objects/unicodeobject.c
parent d1f7fae424d51b0374c8204599583c4a26c1a992 (diff)
download cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.zip
cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.gz
cpython-eb50cd37eac47dd4dc71ab42d0582dfb6eac4515.tar.bz2
gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 76
1 files changed, 76 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 49981a1..33cbc98 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -10674,6 +10674,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
}
int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
+{ /* Convenience form of PyUnicode_EqualToUTF8AndSize() for C strings.
+ * The byte length of str is measured with strlen(), so str must be
+ * non-NULL and NUL-terminated, and only the bytes before its first NUL
+ * take part in the comparison.
+ * The result of PyUnicode_EqualToUTF8AndSize() is returned unchanged. */
+ return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
+}
+
+int
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
+{ /* Return 1 if the string object unicode matches the size bytes at str
+ * read as UTF-8, and 0 otherwise.
+ *
+ * unicode: string object to test (checked only by assert()).
+ * str:     start of the byte buffer; must not be NULL (assert()).
+ * size:    number of bytes at str to compare; no NUL terminator is
+ *          looked for.
+ *
+ * No encoded copy of unicode is allocated here: either an existing byte
+ * buffer is compared with memcmp(), or each code point is checked against
+ * the bytes its UTF-8 encoding would consist of. */
+ assert(_PyUnicode_CHECK(unicode));
+ assert(str);
+ /* Fast paths: when a byte representation of the string is already at
+ * hand, equality is a length check followed by one memcmp(). */
+ if (PyUnicode_IS_ASCII(unicode)) { /* compare the 1-byte data directly */
+ Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+ return size == len &&
+ memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
+ }
+ if (PyUnicode_UTF8(unicode) != NULL) { /* a UTF-8 buffer is available on the object */
+ Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
+ return size == len &&
+ memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
+ }
+ /* Slow path. Quick rejection on length before looking at any byte:
+ * the loop below consumes between 1 and 4 bytes per code point and must
+ * consume exactly size bytes, so a size above 4 * len can never match.
+ * len >= size is rejected as well; the ASCII case has already returned
+ * above, so presumably at least one code point here needs 2+ bytes.
+ * The casts to size_t make both comparisons unsigned. */
+ Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+ if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
+ return 0;
+ }
+ const unsigned char *s = (const unsigned char *)str;
+ const unsigned char *ends = s + (size_t)size; /* one past the last input byte */
+ int kind = PyUnicode_KIND(unicode);
+ const void *data = PyUnicode_DATA(unicode);
+ /* Compare Unicode string and UTF-8 string: for every code point, check
+ * that the next bytes at s are exactly the bytes of its UTF-8 encoding,
+ * then advance s past them. Each branch first makes sure that enough
+ * input bytes remain before reading them. */
+ for (Py_ssize_t i = 0; i < len; i++) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch < 0x80) { /* 1 byte: the code point value itself */
+ if (ends == s || s[0] != ch) {
+ return 0;
+ }
+ s += 1;
+ }
+ else if (ch < 0x800) { /* 2 bytes: 110xxxxx 10xxxxxx */
+ if ((ends - s) < 2 ||
+ s[0] != (0xc0 | (ch >> 6)) ||
+ s[1] != (0x80 | (ch & 0x3f)))
+ {
+ return 0;
+ }
+ s += 2;
+ }
+ else if (ch < 0x10000) { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+ if (Py_UNICODE_IS_SURROGATE(ch) || /* a surrogate never compares equal here */
+ (ends - s) < 3 ||
+ s[0] != (0xe0 | (ch >> 12)) ||
+ s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
+ s[2] != (0x80 | (ch & 0x3f)))
+ {
+ return 0;
+ }
+ s += 3;
+ }
+ else { /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ assert(ch <= MAX_UNICODE);
+ if ((ends - s) < 4 ||
+ s[0] != (0xf0 | (ch >> 18)) ||
+ s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
+ s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
+ s[3] != (0x80 | (ch & 0x3f)))
+ {
+ return 0;
+ }
+ s += 4;
+ }
+ }
+ return s == ends; /* equal only if every input byte was consumed */
+}
+
+int
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
{
size_t len;