summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2020-05-12 09:42:04 (GMT)
committer: GitHub <noreply@github.com> 2020-05-12 09:42:04 (GMT)
commit: 74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch)
tree: 6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Objects
parent: f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff)
download: cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/unicodeobject.c 64
1 file changed, 41 insertions, 23 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 18b9458..276547c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
Py_RETURN_TRUE;
}
-int
-PyUnicode_IsIdentifier(PyObject *self)
+Py_ssize_t
+_PyUnicode_ScanIdentifier(PyObject *self)
{
Py_ssize_t i;
- int ready = PyUnicode_IS_READY(self);
+ if (PyUnicode_READY(self) == -1)
+ return -1;
- Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}
- int kind = 0;
- const void *data = NULL;
- const wchar_t *wstr = NULL;
- Py_UCS4 ch;
- if (ready) {
- kind = PyUnicode_KIND(self);
- data = PyUnicode_DATA(self);
- ch = PyUnicode_READ(kind, data, 0);
- }
- else {
- wstr = _PyUnicode_WSTR(self);
- ch = wstr[0];
- }
+ int kind = PyUnicode_KIND(self);
+ const void *data = PyUnicode_DATA(self);
+ Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
/* PEP 3131 says that the first character must be in
XID_Start and subsequent characters in XID_Continue,
and for the ASCII range, the 2.x rules apply (i.e
@@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
}
for (i = 1; i < len; i++) {
- if (ready) {
- ch = PyUnicode_READ(kind, data, i);
+ ch = PyUnicode_READ(kind, data, i);
+ if (!_PyUnicode_IsXidContinue(ch)) {
+ return i;
}
- else {
- ch = wstr[i];
+ }
+ return i;
+}
+
+int
+PyUnicode_IsIdentifier(PyObject *self)
+{
+ if (PyUnicode_IS_READY(self)) {
+ Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+ /* an empty string is not a valid identifier */
+ return len && i == len;
+ }
+ else {
+ Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+ if (len == 0) {
+ /* an empty string is not a valid identifier */
+ return 0;
}
- if (!_PyUnicode_IsXidContinue(ch)) {
+
+ const wchar_t *wstr = _PyUnicode_WSTR(self);
+ Py_UCS4 ch = wstr[0];
+ if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}
+
+ for (i = 1; i < len; i++) {
+ ch = wstr[i];
+ if (!_PyUnicode_IsXidContinue(ch)) {
+ return 0;
+ }
+ }
+ return 1;
}
- return 1;
}
/*[clinic input]