diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2020-05-12 09:42:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-05-12 09:42:04 (GMT) |
commit | 74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch) | |
tree | 6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Parser | |
parent | f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff) | |
download | cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2 |
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
Diffstat (limited to 'Parser')
-rw-r--r-- | Parser/pegen/pegen.c | 3 | ||||
-rw-r--r-- | Parser/tokenizer.c | 46 |
2 files changed, 37 insertions, 12 deletions
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index c80f086..5f8c862 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -337,9 +337,6 @@ tokenizer_error(Parser *p) case E_TOKEN: msg = "invalid token"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_EOFS: RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); return -1; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2b6af..b81fa11 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1101,25 +1101,53 @@ static int verify_identifier(struct tok_state *tok) { PyObject *s; - int result; if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); if (s == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { + tok->done = E_DECODE; + } + else { tok->done = E_ERROR; } return 0; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) { - tok->done = E_IDENTIFIER; + Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); + if (invalid < 0) { + Py_DECREF(s); + tok->done = E_ERROR; + return 0; } - return result; + assert(PyUnicode_GET_LENGTH(s) > 0); + if (invalid < PyUnicode_GET_LENGTH(s)) { + Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + /* Determine the offset in UTF-8 encoded input */ + Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); + if (s != NULL) { + Py_SETREF(s, PyUnicode_AsUTF8String(s)); + } + if (s == NULL) { + tok->done = E_ERROR; + return 0; + } + tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); + } + Py_DECREF(s); + // PyUnicode_FromFormatV() does not support %X + char hex[9]; + snprintf(hex, sizeof(hex), "%04X", ch); + if (Py_UNICODE_ISPRINTABLE(ch)) { + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + } + else { + syntaxerror(tok, "invalid non-printable character U+%s", hex); + } + return 0; + } + Py_DECREF(s); + return 1; } static int |