author | Serhiy Storchaka <storchaka@gmail.com> | 2014-01-09 16:36:09 (GMT)
---|---|---
committer | Serhiy Storchaka <storchaka@gmail.com> | 2014-01-09 16:36:09 (GMT)
commit | 768c16ce0273a74fa846cc388753280b17b02cfc (patch)
tree | d2fc7f94a08fb20f882e3e0b299a59fea1251aa8 /Lib
parent | 21e7d4cd5eb5a1ee153baf4c7915db80e6ca59e1 (diff)
Issue #18960: Fix bugs with Python source code encoding in the second line.
* The first line of a Python script could be executed twice when the source
  encoding (not equal to 'utf-8') was specified on the second line.
* Now the source encoding declaration on the second line isn't effective if
the first line contains anything except a comment.
* As a consequence, 'python -x' now works again with files whose source
  encoding declaration is on the second line, and can again be used to make
  Python batch files on Windows.
* The tokenize module now ignores the source encoding declaration on the
  second line if the first line contains anything except a comment (see the
  example after this list).
* IDLE now ignores the source encoding declaration on the second line if the
first line contains anything except a comment.
* 2to3 and the findnocoding.py script now ignore the source encoding
declaration on the second line if the first line contains anything except
a comment.
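
To make the new behavior concrete, here is a minimal sketch (not part of the commit; it assumes Python 3.4+ with this fix applied) that feeds both kinds of files to tokenize.detect_encoding():

```python
import io
from tokenize import detect_encoding

# First line is a comment: the coding cookie on the second line is honoured.
commented = b"#!/usr/bin/env python\n# -*- coding: iso8859-15 -*-\npass\n"
encoding, consumed = detect_encoding(io.BytesIO(commented).readline)
print(encoding)  # 'iso8859-15'

# First line is code: the second-line cookie is now ignored and the
# default encoding is returned instead.
code_first = b"pass\n# -*- coding: iso8859-15 -*-\n"
encoding, consumed = detect_encoding(io.BytesIO(code_first).readline)
print(encoding)  # 'utf-8'
```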
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/idlelib/IOBinding.py      |  3
-rw-r--r-- | Lib/lib2to3/pgen2/tokenize.py |  3
-rw-r--r-- | Lib/test/test_tokenize.py     | 33
-rw-r--r-- | Lib/tokenize.py               |  3
4 files changed, 42 insertions, 0 deletions
diff --git a/Lib/idlelib/IOBinding.py b/Lib/idlelib/IOBinding.py
index cba8048..f008b46 100644
--- a/Lib/idlelib/IOBinding.py
+++ b/Lib/idlelib/IOBinding.py
@@ -64,6 +64,7 @@ encoding = locale_encoding  ### KBK 07Sep07  This is used all over IDLE, check!
                             ### 'encoding' is used below in encode(), check!
 
 coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def coding_spec(data):
     """Return the encoding declaration according to PEP 263.
@@ -93,6 +94,8 @@ def coding_spec(data):
         match = coding_re.match(line)
         if match is not None:
             break
+        if not blank_re.match(line):
+            return None
     else:
         return None
     name = match.group(1)
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index b7c6461..1bb931e 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -237,6 +237,7 @@ class Untokenizer:
            toks_append(tokval)
 
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def _get_normal_name(orig_enc):
     """Imitates get_normal_name in tokenizer.c."""
@@ -309,6 +310,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second:
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 1765085..6ed8597 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_cookie_second_line_noncommented_first_line(self):
+        lines = (
+            b"print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        expected = [b"print('\xc2\xa3')\n"]
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_commented_first_line(self):
+        lines = (
+            b"#print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_empty_first_line(self):
+        lines = (
+            b'\n',
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
     def test_latin1_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index d0609e8..294bf9a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
 
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -409,6 +410,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second: