diff options
-rw-r--r-- | Lib/test/test_unicode.py | 7 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst | 2 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 26 |
3 files changed, 31 insertions, 4 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 2839889..2ee4e64 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -720,6 +720,13 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse("©".isidentifier()) self.assertFalse("0".isidentifier()) + @support.cpython_only + def test_isidentifier_legacy(self): + import _testcapi + u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊' + self.assertTrue(u.isidentifier()) + self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier()) + def test_isprintable(self): self.assertTrue("".isprintable()) self.assertTrue(" ".isprintable()) diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst new file mode 100644 index 0000000..1252db4 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst @@ -0,0 +1,2 @@ +Fixed :meth:`str.isidentifier` for non-canonicalized strings containing +non-BMP characters on Windows. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 276547c..826298c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self) return len && i == len; } else { - Py_ssize_t i, len = PyUnicode_GET_SIZE(self); + Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self); if (len == 0) { /* an empty string is not a valid identifier */ return 0; } const wchar_t *wstr = _PyUnicode_WSTR(self); - Py_UCS4 ch = wstr[0]; + Py_UCS4 ch = wstr[i++]; +#if SIZEOF_WCHAR_T == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(ch) + && i < len + && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) + { + ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); + i++; + } +#endif if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { return 0; } - for (i = 1; i < len; i++) { - ch = wstr[i]; + while (i < len) { + ch = wstr[i++]; +#if SIZEOF_WCHAR_T == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(ch) + && i < len + && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) + { + ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); + i++; + } +#endif if (!_PyUnicode_IsXidContinue(ch)) { return 0; } |