diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2020-05-12 09:42:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-05-12 09:42:04 (GMT) |
commit | 74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch) | |
tree | 6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Objects | |
parent | f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff) | |
download | cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2 |
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 64 |
1 file changed, 41 insertions, 23 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 18b9458..276547c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
     Py_RETURN_TRUE;
 }
 
-int
-PyUnicode_IsIdentifier(PyObject *self)
+Py_ssize_t
+_PyUnicode_ScanIdentifier(PyObject *self)
 {
     Py_ssize_t i;
-    int ready = PyUnicode_IS_READY(self);
+    if (PyUnicode_READY(self) == -1)
+        return -1;
 
-    Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
     if (len == 0) {
         /* an empty string is not a valid identifier */
         return 0;
     }
 
-    int kind = 0;
-    const void *data = NULL;
-    const wchar_t *wstr = NULL;
-    Py_UCS4 ch;
-    if (ready) {
-        kind = PyUnicode_KIND(self);
-        data = PyUnicode_DATA(self);
-        ch = PyUnicode_READ(kind, data, 0);
-    }
-    else {
-        wstr = _PyUnicode_WSTR(self);
-        ch = wstr[0];
-    }
+    int kind = PyUnicode_KIND(self);
+    const void *data = PyUnicode_DATA(self);
+    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
     /* PEP 3131 says that the first character must be in
        XID_Start and subsequent characters in XID_Continue,
        and for the ASCII range, the 2.x rules apply (i.e
@@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
     }
 
     for (i = 1; i < len; i++) {
-        if (ready) {
-            ch = PyUnicode_READ(kind, data, i);
+        ch = PyUnicode_READ(kind, data, i);
+        if (!_PyUnicode_IsXidContinue(ch)) {
+            return i;
         }
-        else {
-            ch = wstr[i];
+    }
+    return i;
+}
+
+int
+PyUnicode_IsIdentifier(PyObject *self)
+{
+    if (PyUnicode_IS_READY(self)) {
+        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
+        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+        /* an empty string is not a valid identifier */
+        return len && i == len;
+    }
+    else {
+        Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+        if (len == 0) {
+            /* an empty string is not a valid identifier */
+            return 0;
         }
-        if (!_PyUnicode_IsXidContinue(ch)) {
+
+        const wchar_t *wstr = _PyUnicode_WSTR(self);
+        Py_UCS4 ch = wstr[0];
+        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
             return 0;
         }
+
+        for (i = 1; i < len; i++) {
+            ch = wstr[i];
+            if (!_PyUnicode_IsXidContinue(ch)) {
+                return 0;
+            }
+        }
+        return 1;
     }
-    return 1;
 }
 
 /*[clinic input]