summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2020-05-12 09:42:04 (GMT)
committerGitHub <noreply@github.com>2020-05-12 09:42:04 (GMT)
commit74ea6b5a7501fb393cd567fb21998d0bfeeb267c (patch)
tree6f82cb1ae91f9cc21e0181f7284039b7d58d0309
parentf3a5b7ada0c951f317dbd307de4b410e58d3e1b3 (diff)
downloadcpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.zip
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.gz
cpython-74ea6b5a7501fb393cd567fb21998d0bfeeb267c.tar.bz2
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
-rw-r--r--Include/cpython/unicodeobject.h2
-rw-r--r--Include/errcode.h1
-rw-r--r--Lib/test/test_fstring.py2
-rw-r--r--Lib/test/test_source_encoding.py3
-rw-r--r--Lib/test/test_unicode_identifiers.py8
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst1
-rw-r--r--Objects/unicodeobject.c64
-rw-r--r--Parser/pegen/pegen.c3
-rw-r--r--Parser/tokenizer.c46
-rw-r--r--Python/pythonrun.c3
10 files changed, 90 insertions, 43 deletions
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 81a35cd..9432687 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
+PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
+
#ifdef __cplusplus
}
#endif
diff --git a/Include/errcode.h b/Include/errcode.h
index b37cd26..790518b 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -29,7 +29,6 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
-#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#define E_BADSINGLE 27 /* Ill-formed single statement input */
#ifdef __cplusplus
diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py
index ac5aa9a..e0bb5b5 100644
--- a/Lib/test/test_fstring.py
+++ b/Lib/test/test_fstring.py
@@ -583,7 +583,7 @@ non-important content
])
# Different error message is raised for other whitespace characters.
- self.assertAllRaise(SyntaxError, 'invalid character in identifier',
+ self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
["f'''{\xa0}'''",
"\xa0",
])
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index a0bd741..5ca4346 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -57,6 +57,9 @@ class MiscSourceEncodingTest(unittest.TestCase):
# one byte in common with the UTF-16-LE BOM
self.assertRaises(SyntaxError, eval, b'\xff\x20')
+ # one byte in common with the UTF-8 BOM
+ self.assertRaises(SyntaxError, eval, b'\xef\x20')
+
# two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py
index 07332c4..5b9ced5 100644
--- a/Lib/test/test_unicode_identifiers.py
+++ b/Lib/test/test_unicode_identifiers.py
@@ -20,9 +20,11 @@ class PEP3131Test(unittest.TestCase):
def test_invalid(self):
try:
from test import badsyntax_3131
- except SyntaxError as s:
- self.assertEqual(str(s),
- "invalid character in identifier (badsyntax_3131.py, line 2)")
+ except SyntaxError as err:
+ self.assertEqual(str(err),
+ "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
+ self.assertEqual(err.lineno, 2)
+ self.assertEqual(err.offset, 1)
else:
self.fail("expected exception didn't occur")
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst
new file mode 100644
index 0000000..5587d4f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst
@@ -0,0 +1 @@
+Improved syntax errors for invalid characters in source code.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 18b9458..276547c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
Py_RETURN_TRUE;
}
-int
-PyUnicode_IsIdentifier(PyObject *self)
+Py_ssize_t
+_PyUnicode_ScanIdentifier(PyObject *self)
{
Py_ssize_t i;
- int ready = PyUnicode_IS_READY(self);
+ if (PyUnicode_READY(self) == -1)
+ return -1;
- Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}
- int kind = 0;
- const void *data = NULL;
- const wchar_t *wstr = NULL;
- Py_UCS4 ch;
- if (ready) {
- kind = PyUnicode_KIND(self);
- data = PyUnicode_DATA(self);
- ch = PyUnicode_READ(kind, data, 0);
- }
- else {
- wstr = _PyUnicode_WSTR(self);
- ch = wstr[0];
- }
+ int kind = PyUnicode_KIND(self);
+ const void *data = PyUnicode_DATA(self);
+ Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
/* PEP 3131 says that the first character must be in
XID_Start and subsequent characters in XID_Continue,
and for the ASCII range, the 2.x rules apply (i.e
@@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
}
for (i = 1; i < len; i++) {
- if (ready) {
- ch = PyUnicode_READ(kind, data, i);
+ ch = PyUnicode_READ(kind, data, i);
+ if (!_PyUnicode_IsXidContinue(ch)) {
+ return i;
}
- else {
- ch = wstr[i];
+ }
+ return i;
+}
+
+int
+PyUnicode_IsIdentifier(PyObject *self)
+{
+ if (PyUnicode_IS_READY(self)) {
+ Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+ /* an empty string is not a valid identifier */
+ return len && i == len;
+ }
+ else {
+ Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+ if (len == 0) {
+ /* an empty string is not a valid identifier */
+ return 0;
}
- if (!_PyUnicode_IsXidContinue(ch)) {
+
+ const wchar_t *wstr = _PyUnicode_WSTR(self);
+ Py_UCS4 ch = wstr[0];
+ if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}
+
+ for (i = 1; i < len; i++) {
+ ch = wstr[i];
+ if (!_PyUnicode_IsXidContinue(ch)) {
+ return 0;
+ }
+ }
+ return 1;
}
- return 1;
}
/*[clinic input]
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index c80f086..5f8c862 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN:
msg = "invalid token";
break;
- case E_IDENTIFIER:
- msg = "invalid character in identifier";
- break;
case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1;
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0f2b6af..b81fa11 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1101,25 +1101,53 @@ static int
verify_identifier(struct tok_state *tok)
{
PyObject *s;
- int result;
if (tok->decoding_erred)
return 0;
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
- PyErr_Clear();
- tok->done = E_IDENTIFIER;
- } else {
+ tok->done = E_DECODE;
+ }
+ else {
tok->done = E_ERROR;
}
return 0;
}
- result = PyUnicode_IsIdentifier(s);
- Py_DECREF(s);
- if (result == 0) {
- tok->done = E_IDENTIFIER;
+ Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
+ if (invalid < 0) {
+ Py_DECREF(s);
+ tok->done = E_ERROR;
+ return 0;
}
- return result;
+ assert(PyUnicode_GET_LENGTH(s) > 0);
+ if (invalid < PyUnicode_GET_LENGTH(s)) {
+ Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
+ if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
+ /* Determine the offset in UTF-8 encoded input */
+ Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
+ if (s != NULL) {
+ Py_SETREF(s, PyUnicode_AsUTF8String(s));
+ }
+ if (s == NULL) {
+ tok->done = E_ERROR;
+ return 0;
+ }
+ tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
+ }
+ Py_DECREF(s);
+ // PyUnicode_FromFormatV() does not support %X
+ char hex[9];
+ snprintf(hex, sizeof(hex), "%04X", ch);
+ if (Py_UNICODE_ISPRINTABLE(ch)) {
+ syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+ }
+ else {
+ syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ }
+ return 0;
+ }
+ Py_DECREF(s);
+ return 1;
}
static int
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 1b79a33..45f08b7 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
msg = "unexpected character after line continuation character";
break;
- case E_IDENTIFIER:
- msg = "invalid character in identifier";
- break;
case E_BADSINGLE:
msg = "multiple statements found while compiling a single statement";
break;