summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_tokenize.py2
-rw-r--r--Lib/tokenize.py33
-rw-r--r--Misc/NEWS3
3 files changed, 25 insertions, 13 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8fbd216..75a7a80 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -795,6 +795,8 @@ class TestDetectEncoding(TestCase):
self.assertEquals(encoding, 'utf-8')
self.assertEquals(consumed_lines, [])
+ readline = self.get_readline((b'# coding: bad\n',))
+ self.assertRaises(SyntaxError, detect_encoding, readline)
class TestTokenize(TestCase):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a..16c4f3f 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
import re, string, sys
from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
from itertools import chain, repeat
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -251,11 +251,11 @@ def detect_encoding(readline):
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
- but disagree, a SyntaxError will be raised.
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
- utf8_bom = b'\xef\xbb\xbf'
bom_found = False
encoding = None
def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
try:
line_string = line.decode('ascii')
except UnicodeDecodeError:
- pass
- else:
- matches = cookie_re.findall(line_string)
- if matches:
- encoding = matches[0]
- if bom_found and lookup(encoding).name != 'utf-8':
- # This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- return encoding
+ return None
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = matches[0]
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found and codec.name != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ return encoding
first = read_or_stop()
- if first.startswith(utf8_bom):
+ if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
if not first:
diff --git a/Misc/NEWS b/Misc/NEWS
index 10a0745..6568a1c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
Library
-------
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+ codec cannot be found. This is for compatibility with the builtin behavior.
+
- Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
give correct results in the case where one argument is a quiet NaN
and the other is a finite number that requires rounding.