diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/idlelib/IOBinding.py | 3 | ||||
-rw-r--r-- | Lib/lib2to3/pgen2/tokenize.py | 3 | ||||
-rw-r--r-- | Lib/test/test_tokenize.py | 33 | ||||
-rw-r--r-- | Lib/tokenize.py | 3 |
4 files changed, 42 insertions, 0 deletions
diff --git a/Lib/idlelib/IOBinding.py b/Lib/idlelib/IOBinding.py index cba8048..f008b46 100644 --- a/Lib/idlelib/IOBinding.py +++ b/Lib/idlelib/IOBinding.py @@ -64,6 +64,7 @@ encoding = locale_encoding ### KBK 07Sep07 This is used all over IDLE, check! ### 'encoding' is used below in encode(), check! coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def coding_spec(data): """Return the encoding declaration according to PEP 263. @@ -93,6 +94,8 @@ def coding_spec(data): match = coding_re.match(line) if match is not None: break + if not blank_re.match(line): + return None else: return None name = match.group(1) diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py index b7c6461..1bb931e 100644 --- a/Lib/lib2to3/pgen2/tokenize.py +++ b/Lib/lib2to3/pgen2/tokenize.py @@ -237,6 +237,7 @@ class Untokenizer: toks_append(tokval) cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" @@ -309,6 +310,8 @@ def detect_encoding(readline): encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 1765085..6ed8597 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase): readline = self.get_readline(lines) self.assertRaises(SyntaxError, detect_encoding, readline) + def test_cookie_second_line_noncommented_first_line(self): + lines = ( + b"print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + expected = [b"print('\xc2\xa3')\n"] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_commented_first_line(self): + lines = ( + b"#print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_empty_first_line(self): + lines = ( + b'\n', + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + def test_latin1_normalization(self): # See get_normal_name() in tokenizer.c. encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", diff --git a/Lib/tokenize.py b/Lib/tokenize.py index d0609e8..294bf9a 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8 import collections from io import TextIOWrapper cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) import token __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", @@ -409,6 +410,8 @@ def detect_encoding(readline): encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: |