Issue #18960: Fix bugs with Python source code encoding in the second line.

* The first line of a Python script could be executed twice when the source encoding (not equal to 'utf-8') was specified on the second line. * Now the source encoding declaration on the second line isn't effective if the first line contains anything except a comment. * As a consequence, 'python -x' now works again with files that have the source encoding declaration specified on the second line, and can be used again to make Python batch files on Windows. * The tokenize module now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * IDLE now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * 2to3 and the findnocoding.py script now ignore the source encoding declaration on the second line if the first line contains anything except a comment.
author: Serhiy Storchaka <storchaka@gmail.com> 2014-01-09 16:41:59 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2014-01-09 16:41:59 (GMT)
commit: 7282ff6d5b56825e74c0715aea86e927d2fd339f (patch)
tree: 5bdc07d3601764c5cecdb78c276151f4ba03eef6
parent: 766e10c4a808727ecefca7dec59819121477d27f (diff)
parent: 768c16ce0273a74fa846cc388753280b17b02cfc (diff)
download: cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.zip
cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.tar.gz
cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.tar.bz2
7 files changed, 87 insertions, 5 deletions
diff --git a/Lib/idlelib/IOBinding.py b/Lib/idlelib/IOBinding.py
index 7589ab8..3cd7a4c 100644
--- a/Lib/idlelib/IOBinding.py
+++ b/Lib/idlelib/IOBinding.py
@@ -64,6 +64,7 @@ encoding = locale_encoding  ### KBK 07Sep07  This is used all over IDLE, check!
                             ### 'encoding' is used below in encode(), check!
 
 coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def coding_spec(data):
     """Return the encoding declaration according to PEP 263.
@@ -93,6 +94,8 @@ def coding_spec(data):
         match = coding_re.match(line)
         if match is not None:
             break
+        if not blank_re.match(line):
+            return None
     else:
         return None
     name = match.group(1)
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index b7c6461..1bb931e 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -237,6 +237,7 @@ class Untokenizer:
             toks_append(tokval)
 
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def _get_normal_name(orig_enc):
     """Imitates get_normal_name in tokenizer.c."""
@@ -309,6 +310,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second:
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 1765085..6ed8597 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_cookie_second_line_noncommented_first_line(self):
+        lines = (
+            b"print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        expected = [b"print('\xc2\xa3')\n"]
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_commented_first_line(self):
+        lines = (
+            b"#print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_empty_first_line(self):
+        lines = (
+            b'\n',
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
     def test_latin1_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f614aeb..7785c98 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -409,6 +410,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second:
diff --git a/Misc/NEWS b/Misc/NEWS
index 72e26b8..cb59a68 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,13 @@ Release date: 2014-01-19
 Core and Builtins
 -----------------
 
+- Issue #18960: The first line of a Python script could be executed twice when
+  the source encoding was specified on the second line.  Now the source encoding
+  declaration on the second line isn't effective if the first line contains
+  anything except a comment.  'python -x' now works again with files with the
+  source encoding declarations, and can be used to make Python batch files
+  on Windows.
+
 - Issue #19081: When a zipimport .zip file in sys.path being imported from
   is modified during the lifetime of the Python process after zipimport has
   already cached the zip's table of contents we detect this and recover
@@ -18,6 +25,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #18960: The tokenize module now ignores the source encoding declaration
+  on the second line if the first line contains anything except a comment.
+
 - Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
   consumption.
 
@@ -33,9 +43,19 @@ Library
 
 - Issue #20072: Fixed multiple errors in tkinter with wantobjects is False.
 
+IDLE
+----
+
+- Issue #18960: IDLE now ignores the source encoding declaration on the second
+  line if the first line contains anything except a comment.
+
 Tools/Demos
 -----------
 
+- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
+  encoding declaration on the second line if the first line contains anything
+  except a comment.
+
 - Issue #19723: The marker comments Argument Clinic uses have been changed
   to improve readability.
 
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5bf7e84..0c95b63 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
     char *cs;
     int r = 1;
 
-    if (tok->cont_line)
+    if (tok->cont_line) {
         /* It's a continuation line, so it can't be a coding spec. */
+        tok->read_coding_spec = 1;
         return 1;
+    }
     if (!get_coding_spec(line, &cs, size, tok))
         return 0;
-    if (!cs)
+    if (!cs) {
+        Py_ssize_t i;
+        for (i = 0; i < size; i++) {
+            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
+                break;
+            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
+                /* Stop checking coding spec after a line containing
+                 * anything except a comment. */
+                tok->read_coding_spec = 1;
+                break;
+            }
+        }
         return 1;
+    }
     tok->read_coding_spec = 1;
     if (tok->encoding == NULL) {
         assert(tok->decoding_state == STATE_RAW);
@@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
     _Py_IDENTIFIER(open);
     _Py_IDENTIFIER(readline);
     int fd;
+    long pos;
 
     io = PyImport_ImportModuleNoBlock("io");
     if (io == NULL)
         goto cleanup;
 
     fd = fileno(tok->fp);
-    if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
+    /* Due to buffering the file offset for fd can be different from the file
+     * position of tok->fp. */
+    pos = ftell(tok->fp);
+    if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
         goto cleanup;
     }
@@ -752,7 +770,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
     if (newl[0]) {
         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
             return error_ret(tok);
-        if (tok->enc == NULL && newl[1]) {
+        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                    tok, buf_setreadl))
                 return error_ret(tok);
diff --git a/Tools/scripts/findnocoding.py b/Tools/scripts/findnocoding.py
index c0997d6..5f3795e 100755
--- a/Tools/scripts/findnocoding.py
+++ b/Tools/scripts/findnocoding.py
@@ -33,6 +33,7 @@ except ImportError:
 
 
 decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
+blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
 
 def get_declaration(line):
     match = decl_re.match(line)
@@ -58,7 +59,8 @@ def needs_declaration(fullpath):
         line1 = infile.readline()
         line2 = infile.readline()
 
-        if get_declaration(line1) or get_declaration(line2):
+        if (get_declaration(line1) or
+            blank_re.match(line1) and get_declaration(line2)):
             # the file does have an encoding declaration, so trust it
             return False
author	Serhiy Storchaka <storchaka@gmail.com>	2014-01-09 16:41:59 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2014-01-09 16:41:59 (GMT)
commit	7282ff6d5b56825e74c0715aea86e927d2fd339f (patch)
tree	5bdc07d3601764c5cecdb78c276151f4ba03eef6
parent	766e10c4a808727ecefca7dec59819121477d27f (diff)
parent	768c16ce0273a74fa846cc388753280b17b02cfc (diff)
download	cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.zip cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.tar.gz cpython-7282ff6d5b56825e74c0715aea86e927d2fd339f.tar.bz2