summary | refs | log | tree | commit | diff | stats
path: root/Parser
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2020-05-12 09:42:04 (GMT)
committer: GitHub <noreply@github.com>, 2020-05-12 09:42:04 (GMT)
commit: 74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch)
tree: 6f82cb1ae91f9cc21e0181f7284039b7d58d0309 /Parser
parent: f3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff)
download: cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
Diffstat (limited to 'Parser')
-rw-r--r-- Parser/pegen/pegen.c | 3
-rw-r--r-- Parser/tokenizer.c | 46
2 files changed, 37 insertions, 12 deletions
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index c80f086..5f8c862 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN:
msg = "invalid token";
break;
- case E_IDENTIFIER:
- msg = "invalid character in identifier";
- break;
case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1;
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0f2b6af..b81fa11 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1101,25 +1101,53 @@ static int
verify_identifier(struct tok_state *tok)
{
PyObject *s;
- int result;
if (tok->decoding_erred)
return 0;
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
- PyErr_Clear();
- tok->done = E_IDENTIFIER;
- } else {
+ tok->done = E_DECODE;
+ }
+ else {
tok->done = E_ERROR;
}
return 0;
}
- result = PyUnicode_IsIdentifier(s);
- Py_DECREF(s);
- if (result == 0) {
- tok->done = E_IDENTIFIER;
+ Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
+ if (invalid < 0) {
+ Py_DECREF(s);
+ tok->done = E_ERROR;
+ return 0;
}
- return result;
+ assert(PyUnicode_GET_LENGTH(s) > 0);
+ if (invalid < PyUnicode_GET_LENGTH(s)) {
+ Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
+ if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
+ /* Determine the offset in UTF-8 encoded input */
+ Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
+ if (s != NULL) {
+ Py_SETREF(s, PyUnicode_AsUTF8String(s));
+ }
+ if (s == NULL) {
+ tok->done = E_ERROR;
+ return 0;
+ }
+ tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
+ }
+ Py_DECREF(s);
+ // PyUnicode_FromFormatV() does not support %X
+ char hex[9];
+ snprintf(hex, sizeof(hex), "%04X", ch);
+ if (Py_UNICODE_ISPRINTABLE(ch)) {
+ syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+ }
+ else {
+ syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ }
+ return 0;
+ }
+ Py_DECREF(s);
+ return 1;
}
static int