From f3e7ea5b8c220cd63101e419d529c8563f9c6115 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Feb 2020 14:29:33 +0100 Subject: bpo-39500: Document PyUnicode_IsIdentifier() function (GH-18397) PyUnicode_IsIdentifier() does not call Py_FatalError() anymore if the string is not ready. --- Doc/c-api/unicode.rst | 10 +++++ .../C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst | 2 + Objects/unicodeobject.c | 47 +++++++++++++++------- Parser/tokenizer.c | 3 +- 4 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 96d77c4..b1787ed 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -240,6 +240,16 @@ access internal read-only data of Unicode objects: :c:func:`PyUnicode_nBYTE_DATA` family of macros. +.. c:function:: int PyUnicode_IsIdentifier(PyObject *o) + + Return ``1`` if the string is a valid identifier according to the language + definition, section :ref:`identifiers`. Return ``0`` otherwise. + + .. versionchanged:: 3.9 + The function does not call :c:func:`Py_FatalError` anymore if the string + is not ready. + + Unicode Character Properties """""""""""""""""""""""""""" diff --git a/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst b/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst new file mode 100644 index 0000000..2ca359f --- /dev/null +++ b/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst @@ -0,0 +1,2 @@ +:c:func:`PyUnicode_IsIdentifier` does not call :c:func:`Py_FatalError` +anymore if the string is not ready. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fd08ddb..aa874f2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12198,22 +12198,33 @@ unicode_isnumeric_impl(PyObject *self) int PyUnicode_IsIdentifier(PyObject *self) { - int kind; - void *data; Py_ssize_t i; - Py_UCS4 first; + int ready = PyUnicode_IS_READY(self); - if (PyUnicode_READY(self) == -1) { - Py_FatalError("identifier not ready"); + Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); + if (len == 0) { + /* an empty string is not a valid identifier */ return 0; } - /* Special case for empty strings */ - if (PyUnicode_GET_LENGTH(self) == 0) - return 0; - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); + int kind; + void *data; + wchar_t *wstr; + if (ready) { + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + } + else { + wstr = _PyUnicode_WSTR(self); + } + Py_UCS4 ch; + if (ready) { + ch = PyUnicode_READ(kind, data, 0); + } + else { + ch = wstr[0]; + } /* PEP 3131 says that the first character must be in XID_Start and subsequent characters in XID_Continue, and for the ASCII range, the 2.x rules apply (i.e @@ -12222,13 +12233,21 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. */ - first = PyUnicode_READ(kind, data, 0); - if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { return 0; + } - for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) - if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) + for (i = 1; i < len; i++) { + if (ready) { + ch = PyUnicode_READ(kind, data, i); + } + else { + ch = wstr[i]; + } + if (!_PyUnicode_IsXidContinue(ch)) { return 0; + } + } return 1; } diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f73c326..c37cd92 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1079,8 +1079,9 @@ verify_identifier(struct tok_state *tok) } result = PyUnicode_IsIdentifier(s); Py_DECREF(s); - if (result == 0) + if (result == 0) { tok->done = E_IDENTIFIER; + } return result; } -- cgit v0.12