bpo-43833: Emit warnings for numeric literals followed by keyword (GH-25466)

Emit a deprecation warning if the numeric literal is immediately followed by one of keywords: and, else, for, if, in, is, or. Raise a syntax error with more informative message if it is immediately followed by other keyword or identifier. Automerge-Triggered-By: GH:pablogsal
author: Serhiy Storchaka <storchaka@gmail.com> 2021-06-08 23:31:10 (GMT)
committer: GitHub <noreply@github.com> 2021-06-08 23:31:10 (GMT)
commit: 2ea6d890281c415e0a2f00e63526e592da8ce3d9 (patch)
tree: 0aff5648b72d3c7b4557408f8adfce932c59a237
parent: 3e1c7167d86a2a928cdcb659094aa10bb5550c4c (diff)
download: cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.zip
cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.tar.gz
cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.tar.bz2
5 files changed, 246 insertions, 3 deletions
diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst
index 74e6b03..df9806a 100644
--- a/Doc/whatsnew/3.10.rst
+++ b/Doc/whatsnew/3.10.rst
@@ -1444,6 +1444,17 @@ Optimizations
 Deprecated
 ==========
 
+* Currently Python accepts numeric literals immediately followed by keywords,
+  for example ``0in x``, ``1or x``, ``0if 1else 2``.  It allows confusing
+  and ambiguous expressions like ``[0x1for x in y]`` (which can be
+  interpreted as ``[0x1 for x in y]`` or ``[0x1f or x in y]``).  Starting in
+  this release, a deprecation warning is raised if the numeric literal is
+  immediately followed by one of keywords :keyword:`and`, :keyword:`else`,
+  :keyword:`for`, :keyword:`if`, :keyword:`in`, :keyword:`is` and :keyword:`or`.
+  In future releases it will be changed to a syntax warning, and finally to a
+  syntax error.
+  (Contributed by Serhiy Storchaka in :issue:`43833`).
+
 * Starting in this release, there will be a concerted effort to begin
   cleaning up old import semantics that were kept for Python 2.7
   compatibility. Specifically,
@@ -1670,6 +1681,18 @@ This section lists previously described changes and other bugfixes
 that may require changes to your code.
 
 
+Changes in the Python syntax
+----------------------------
+
+* Deprecation warning is now emitted when compiling previously valid syntax
+  if the numeric literal is immediately followed by a keyword (like in ``0in x``).
+  In future releases it will be changed to a syntax warning, and finally to a
+  syntax error.  To get rid of the warning and make the code compatible with
+  future releases just add a space between the numeric literal and the
+  following keyword.
+  (Contributed by Serhiy Storchaka in :issue:`43833`).
+
+
 Changes in the Python API
 -------------------------
 
diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
index d40347c..ea8ae22 100644
--- a/Lib/test/test_compile.py
+++ b/Lib/test/test_compile.py
@@ -162,7 +162,7 @@ if 1:
         for arg in ["077787", "0xj", "0x.", "0e",  "090000000000000",
                     "080000000000000", "000000000000009", "000000000000008",
                     "0b42", "0BADCAFE", "0o123456789", "0b1.1", "0o4.2",
-                    "0b101j2", "0o153j2", "0b100e1", "0o777e1", "0777",
+                    "0b101j", "0o153j", "0b100e1", "0o777e1", "0777",
                     "000777", "000000000000007"]:
             self.assertRaises(SyntaxError, eval, arg)
 
diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py
index ebc9dde..c0820fd 100644
--- a/Lib/test/test_grammar.py
+++ b/Lib/test/test_grammar.py
@@ -177,8 +177,10 @@ class TokenTests(unittest.TestCase):
 
     def test_float_exponent_tokenization(self):
         # See issue 21642.
-        self.assertEqual(1 if 1else 0, 1)
-        self.assertEqual(1 if 0else 0, 0)
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore', DeprecationWarning)
+            self.assertEqual(eval("1 if 1else 0"), 1)
+            self.assertEqual(eval("1 if 0else 0"), 0)
         self.assertRaises(SyntaxError, eval, "0 if 1Else 0")
 
     def test_underscore_literals(self):
@@ -211,6 +213,92 @@ class TokenTests(unittest.TestCase):
         check("1e2_", "invalid decimal literal")
         check("1e+", "invalid decimal literal")
 
+    def test_end_of_numerical_literals(self):  # numeric literal directly followed by a keyword or identifier
+        def check(test):  # compiling `test` must emit a DeprecationWarning
+            with self.assertWarns(DeprecationWarning):
+                compile(test, "<testcase>", "eval")
+
+        def check_error(test):  # compiling `test` must raise SyntaxError and record no warnings
+            with warnings.catch_warnings(record=True) as w:
+                with self.assertRaises(SyntaxError):
+                    compile(test, "<testcase>", "eval")
+            self.assertEqual(w,  [])
+
+        check_error("0xfand x")  # 'a' is itself a hex digit, so the literal is followed by "nd", not "and"
+        check("0o7and x")
+        check("0b1and x")
+        check("9and x")
+        check("0and x")
+        check("1.and x")
+        check("1e3and x")
+        check("1jand x")
+
+        check("0xfor x")
+        check("0o7or x")
+        check("0b1or x")
+        check("9or x")
+        check_error("0or x")  # "0o" is read as an octal prefix, so this is not "0" followed by "or"
+        check("1.or x")
+        check("1e3or x")
+        check("1jor x")
+
+        check("0xfin x")
+        check("0o7in x")
+        check("0b1in x")
+        check("9in x")
+        check("0in x")
+        check("1.in x")
+        check("1e3in x")
+        check("1jin x")
+
+        with warnings.catch_warnings():  # NOTE(review): presumably "<literal> is x" also triggers a SyntaxWarning, silenced here -- confirm
+            warnings.simplefilter('ignore', SyntaxWarning)
+            check("0xfis x")
+            check("0o7is x")
+            check("0b1is x")
+            check("9is x")
+            check("0is x")
+            check("1.is x")
+            check("1e3is x")
+            check("1jis x")
+
+        check("0xfif x else y")
+        check("0o7if x else y")
+        check("0b1if x else y")
+        check("9if x else y")
+        check("0if x else y")
+        check("1.if x else y")
+        check("1e3if x else y")
+        check("1jif x else y")
+
+        check_error("x if 0xfelse y")  # 'e' is itself a hex digit, so the literal is followed by "lse", not "else"
+        check("x if 0o7else y")
+        check("x if 0b1else y")
+        check("x if 9else y")
+        check("x if 0else y")
+        check("x if 1.else y")
+        check("x if 1e3else y")
+        check("x if 1jelse y")
+
+        check("[0x1ffor x in ()]")
+        check("[0x1for x in ()]")
+        check("[0xfor x in ()]")
+        check("[0o7for x in ()]")
+        check("[0b1for x in ()]")
+        check("[9for x in ()]")
+        check("[1.for x in ()]")
+        check("[1e3for x in ()]")
+        check("[1jfor x in ()]")
+
+        check_error("0xfspam")  # literal followed by a non-keyword identifier: SyntaxError, no warning
+        check_error("0o7spam")
+        check_error("0b1spam")
+        check_error("9spam")
+        check_error("0spam")
+        check_error("1.spam")
+        check_error("1e3spam")
+        check_error("1jspam")
+
     def test_string_literals(self):
         x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y)
         x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-04-18-18-07-33.bpo-43833.oChkCi.rst b/Misc/NEWS.d/next/Core and Builtins/2021-04-18-18-07-33.bpo-43833.oChkCi.rst
new file mode 100644
index 0000000..2adbdba
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-04-18-18-07-33.bpo-43833.oChkCi.rst
@@ -0,0 +1,4 @@
+Emit a deprecation warning if a numeric literal is immediately followed by
+one of the keywords: and, else, for, if, in, is, or. Raise a syntax error with
+a more informative message if it is immediately followed by another keyword or
+identifier.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a86af9b..6002f3e 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1121,6 +1121,113 @@ indenterror(struct tok_state *tok)
     return ERRORTOKEN;
 }
 
+static int
+parser_warn(struct tok_state *tok, const char *format, ...)
+{   /* Build a message from a PyUnicode_FromFormatV-style format and issue it as a DeprecationWarning; returns 0 on success, -1 on failure. */
+    PyObject *errmsg;
+    va_list vargs;
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+    errmsg = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+    if (!errmsg) {
+        goto error;
+    }
+
+    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,  /* reported against the tokenizer's current file and line */
+                                 tok->lineno, NULL, NULL) < 0) {
+        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
+            /* Replace the DeprecationWarning exception with a SyntaxError
+               to get a more accurate error report */
+            PyErr_Clear();
+            syntaxerror(tok, "%U", errmsg);
+        }
+        goto error;  /* any other pending exception is left in place */
+    }
+    Py_DECREF(errmsg);
+    return 0;
+
+error:
+    Py_XDECREF(errmsg);  /* errmsg is NULL when message formatting itself failed */
+    tok->done = E_ERROR;
+    return -1;
+}
+
+static int
+lookahead(struct tok_state *tok, const char *test)
+{   /* Return 1 if the upcoming input is exactly `test` followed by a character for which is_potential_identifier_char() is false, else 0; every character read is pushed back. */
+    const char *s = test;
+    int res = 0;
+    while (1) {
+        int c = tok_nextc(tok);
+        if (*s == 0) {  /* all of `test` matched; c is the character right after it */
+            res = !is_potential_identifier_char(c);
+        }
+        else if (c == *s) {  /* still matching: advance in `test` and read the next character */
+            s++;
+            continue;
+        }
+
+        tok_backup(tok, c);  /* undo the last read first, then the matched prefix in reverse order */
+        while (s != test) {
+            tok_backup(tok, *--s);
+        }
+        return res;
+    }
+}
+
+static int
+verify_end_of_number(struct tok_state *tok, int c, const char *kind)
+{
+    /* Emit a deprecation warning only if the numeric literal is immediately
+     * followed by one of keywords which can occur after a numeric literal
+     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
+     * This allows gradually deprecating existing valid code without adding
+     * a warning before the error in most cases of invalid numeric literal
+     * (which would be confusing and break existing tests).
+     * Raise a syntax error with slightly better message than plain
+     * "invalid syntax" if the numeric literal is immediately followed by
+     * other keyword or identifier.  Return 1 to continue, 0 on error.
+     */
+    int r = 0;  /* nonzero when c plus the input after it looks like one of the keywords above */
+    if (c == 'a') {
+        r = lookahead(tok, "nd");
+    }
+    else if (c == 'e') {
+        r = lookahead(tok, "lse");
+    }
+    else if (c == 'f') {
+        r = lookahead(tok, "or");
+    }
+    else if (c == 'i') {
+        int c2 = tok_nextc(tok);
+        if (c2 == 'f' || c2 == 'n' || c2 == 's') {  /* "if", "in", "is": only the second letter is examined here */
+            r = 1;
+        }
+        tok_backup(tok, c2);
+    }
+    else if (c == 'o') {
+        r = lookahead(tok, "r");
+    }
+    if (r) {
+        tok_backup(tok, c);  /* c is pushed back while the warning is issued and read again afterwards */
+        if (parser_warn(tok, "invalid %s literal", kind)) {
+            return 0;  /* parser_warn failed (nonzero return) */
+        }
+        tok_nextc(tok);
+    }
+    else /* In future releases, only error will remain. */
+    if (is_potential_identifier_char(c)) {
+        tok_backup(tok, c);
+        syntaxerror(tok, "invalid %s literal", kind);
+        return 0;
+    }
+    return 1;  /* nothing objectionable follows the literal, or only a warning was issued */
+}
+
 /* Verify that the identifier follows PEP 3131.
    All identifier strings are guaranteed to be "ready" unicode objects.
  */
@@ -1569,6 +1676,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                         c = tok_nextc(tok);
                     } while (isxdigit(c));
                 } while (c == '_');
+                if (!verify_end_of_number(tok, c, "hexadecimal")) {
+                    return ERRORTOKEN;
+                }
             }
             else if (c == 'o' || c == 'O') {
                 /* Octal */
@@ -1595,6 +1705,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                     return syntaxerror(tok,
                             "invalid digit '%c' in octal literal", c);
                 }
+                if (!verify_end_of_number(tok, c, "octal")) {
+                    return ERRORTOKEN;
+                }
             }
             else if (c == 'b' || c == 'B') {
                 /* Binary */
@@ -1621,6 +1734,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                     return syntaxerror(tok,
                             "invalid digit '%c' in binary literal", c);
                 }
+                if (!verify_end_of_number(tok, c, "binary")) {
+                    return ERRORTOKEN;
+                }
             }
             else {
                 int nonzero = 0;
@@ -1664,6 +1780,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                                        "literals are not permitted; "
                                        "use an 0o prefix for octal integers");
                 }
+                if (!verify_end_of_number(tok, c, "decimal")) {
+                    return ERRORTOKEN;
+                }
             }
         }
         else {
@@ -1699,6 +1818,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                         }
                     } else if (!isdigit(c)) {
                         tok_backup(tok, c);
+                        if (!verify_end_of_number(tok, e, "decimal")) {
+                            return ERRORTOKEN;
+                        }
                         tok_backup(tok, e);
                         *p_start = tok->start;
                         *p_end = tok->cur;
@@ -1713,6 +1835,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                     /* Imaginary part */
         imaginary:
                     c = tok_nextc(tok);
+                    if (!verify_end_of_number(tok, c, "imaginary")) {
+                        return ERRORTOKEN;
+                    }
+                }
+                else if (!verify_end_of_number(tok, c, "decimal")) {
+                    return ERRORTOKEN;
                 }
             }
         }
author	Serhiy Storchaka <storchaka@gmail.com>	2021-06-08 23:31:10 (GMT)
committer	GitHub <noreply@github.com>	2021-06-08 23:31:10 (GMT)
commit	2ea6d890281c415e0a2f00e63526e592da8ce3d9 (patch)
tree	0aff5648b72d3c7b4557408f8adfce932c59a237
parent	3e1c7167d86a2a928cdcb659094aa10bb5550c4c (diff)
download	cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.zip cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.tar.gz cpython-2ea6d890281c415e0a2f00e63526e592da8ce3d9.tar.bz2