summary refs log tree commit diff stats
path: root/Objects
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2020-05-12 13:18:00 (GMT)
committer: GitHub <noreply@github.com> 2020-05-12 13:18:00 (GMT)
commit: 5650e76f63a6f4ec55d00ec13f143d84a2efee39 (patch)
tree: 6a3179a1f7b94c3d65985b49c81a5a8fa94e327a /Objects
parent: 7c6e97077525f0ad3cfa0971028313b9079449fd (diff)
download: cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.zip
cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.tar.gz
cpython-5650e76f63a6f4ec55d00ec13f143d84a2efee39.tar.bz2
bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/unicodeobject.c 26
1 files changed, 22 insertions, 4 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 276547c..826298c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
return len && i == len;
}
else {
- Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+ Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}
const wchar_t *wstr = _PyUnicode_WSTR(self);
- Py_UCS4 ch = wstr[0];
+ Py_UCS4 ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+ if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+ && i < len
+ && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+ {
+ ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+ i++;
+ }
+#endif
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}
- for (i = 1; i < len; i++) {
- ch = wstr[i];
+ while (i < len) {
+ ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+ if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+ && i < len
+ && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+ {
+ ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+ i++;
+ }
+#endif
if (!_PyUnicode_IsXidContinue(ch)) {
return 0;
}