diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2020-05-12 13:18:00 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-05-12 13:18:00 (GMT) |
commit | 5650e76f63a6f4ec55d00ec13f143d84a2efee39 (patch) | |
tree | 6a3179a1f7b94c3d65985b49c81a5a8fa94e327a /Objects | |
parent | 7c6e97077525f0ad3cfa0971028313b9079449fd (diff) | |
download | cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.zip cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.tar.gz cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.tar.bz2 |
bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 26 |
1 files changed, 22 insertions, 4 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 276547c..826298c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
         return len && i == len;
     }
     else {
-        Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
         if (len == 0) {
             /* an empty string is not a valid identifier */
             return 0;
         }
 
         const wchar_t *wstr = _PyUnicode_WSTR(self);
-        Py_UCS4 ch = wstr[0];
+        Py_UCS4 ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+            && i < len
+            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+        {
+            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+            i++;
+        }
+#endif
         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
             return 0;
         }
 
-        for (i = 1; i < len; i++) {
-            ch = wstr[i];
+        while (i < len) {
+            ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+                && i < len
+                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+            {
+                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+                i++;
+            }
+#endif
             if (!_PyUnicode_IsXidContinue(ch)) {
                 return 0;
             }