bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)

author: Serhiy Storchaka <storchaka@gmail.com> 2020-05-12 09:42:04 (GMT)
committer: GitHub <noreply@github.com> 2020-05-12 09:42:04 (GMT)
commit: 74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch)
tree: 6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Parser
parent: f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff)
download: cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2
2 files changed, 37 insertions, 12 deletions
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index c80f086..5f8c862 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
         case E_TOKEN:
             msg = "invalid token";
             break;
-        case E_IDENTIFIER:
-            msg = "invalid character in identifier";
-            break;
         case E_EOFS:
             RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
             return -1;
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0f2b6af..b81fa11 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1101,25 +1101,53 @@ static int
 verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
-    int result;
     if (tok->decoding_erred)
         return 0;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
     if (s == NULL) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
-            PyErr_Clear();
-            tok->done = E_IDENTIFIER;
-        } else {
+            tok->done = E_DECODE;
+        }
+        else {
             tok->done = E_ERROR;
         }
         return 0;
     }
-    result = PyUnicode_IsIdentifier(s);
-    Py_DECREF(s);
-    if (result == 0) {
-        tok->done = E_IDENTIFIER;
+    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
+    if (invalid < 0) {
+        Py_DECREF(s);
+        tok->done = E_ERROR;
+        return 0;
     }
-    return result;
+    assert(PyUnicode_GET_LENGTH(s) > 0);
+    if (invalid < PyUnicode_GET_LENGTH(s)) {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
+        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
+            /* Determine the offset in UTF-8 encoded input */
+            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
+            if (s != NULL) {
+                Py_SETREF(s, PyUnicode_AsUTF8String(s));
+            }
+            if (s == NULL) {
+                tok->done = E_ERROR;
+                return 0;
+            }
+            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
+        }
+        Py_DECREF(s);
+        // PyUnicode_FromFormatV() does not support %X
+        char hex[9];
+        snprintf(hex, sizeof(hex), "%04X", ch);
+        if (Py_UNICODE_ISPRINTABLE(ch)) {
+            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+        }
+        else {
+            syntaxerror(tok, "invalid non-printable character U+%s", hex);
+        }
+        return 0;
+    }
+    Py_DECREF(s);
+    return 1;
 }
 
 static int
author	Serhiy Storchaka <storchaka@gmail.com>	2020-05-12 09:42:04 (GMT)
committer	GitHub <noreply@github.com>	2020-05-12 09:42:04 (GMT)
commit	74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch)
tree	6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Parser
parent	f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff)
download	cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2