summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/test/test_tokenize.py 29
-rw-r--r-- Lib/tokenize.py 22
-rw-r--r-- Misc/NEWS 3
3 files changed, 51 insertions, 3 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 11590ea..915eda9 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -904,6 +904,35 @@ class TestDetectEncoding(TestCase):
self.assertEqual(fp.encoding, 'utf-8-sig')
self.assertEqual(fp.mode, 'r')
+    def test_filename_in_exception(self):
+        # When readline.__self__.name exists, the SyntaxError must mention it.
+        path = 'some_file_path'
+        lines = (
+            b'print("\xdf")',  # Latin-1 SHARP S; invalid as UTF-8
+            )
+        class Bunk:  # file-like stub: a .name and a bytes readline()
+            def __init__(self, lines, path):
+                self.name = path
+                self._lines = lines
+                self._index = 0
+
+            def readline(self):
+                if self._index == len(self._lines):
+                    raise StopIteration
+                line = self._lines[self._index]
+                self._index += 1
+                return line
+
+        with self.assertRaises(SyntaxError):
+            ins = Bunk(lines, path)
+            # Make sure lacking a name isn't an issue.
+            del ins.name
+            detect_encoding(ins.readline)
+        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
+            ins = Bunk(lines, path)
+            detect_encoding(ins.readline)
+
+
class TestTokenize(TestCase):
def test_tokenize(self):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c05f764..e4c9d3c 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -353,6 +353,10 @@ def detect_encoding(readline):
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
+ try:
+ filename = readline.__self__.name
+ except AttributeError:
+ filename = None
bom_found = False
encoding = None
default = 'utf-8'
@@ -369,7 +373,10 @@ def detect_encoding(readline):
# per default encoding.
line_string = line.decode('utf-8')
except UnicodeDecodeError:
- raise SyntaxError("invalid or missing encoding declaration")
+ msg = "invalid or missing encoding declaration"
+ if filename is not None:
+ msg = '{} for {!r}'.format(msg, filename)
+ raise SyntaxError(msg)
matches = cookie_re.findall(line_string)
if not matches:
@@ -379,12 +386,21 @@ def detect_encoding(readline):
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
- raise SyntaxError("unknown encoding: " + encoding)
+ if filename is None:
+ msg = "unknown encoding: " + encoding
+ else:
+ msg = "unknown encoding for {!r}: {}".format(filename,
+ encoding)
+ raise SyntaxError(msg)
if bom_found:
if codec.name != 'utf-8':
# This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
+ if filename is None:
+ msg = 'encoding problem: utf-8'
+ else:
+ msg = 'encoding problem for {!r}: utf-8'.format(filename)
+ raise SyntaxError(msg)
encoding += '-sig'
return encoding
diff --git a/Misc/NEWS b/Misc/NEWS
index b98cdca..6d3410f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -55,6 +55,9 @@ Core and Builtins
Library
-------
+- Issue #14629: tokenize.detect_encoding will specify the filename in the
+  SyntaxError exception if found at readline.__self__.name.
+
- Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the
first two lines have non-UTF-8 characters without an encoding declaration.