From 58c0752a33253641c1423fac2d4ef3f623fbcb46 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 9 Nov 2010 01:08:59 +0000 Subject: Issue #10335: Add tokenize.open(), detect the file encoding using tokenize.detect_encoding() and open it in read only mode. --- Doc/library/tokenize.rst | 17 +++++++++-------- Lib/linecache.py | 4 +--- Lib/py_compile.py | 4 +--- Lib/tabnanny.py | 5 +---- Lib/test/test_tokenize.py | 23 ++++++++++++++++++++++- Lib/tokenize.py | 15 +++++++++++++++ Lib/trace.py | 5 ++--- Misc/NEWS | 3 +++ 8 files changed, 54 insertions(+), 22 deletions(-) diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst index dbd01c4..6a96609 100644 --- a/Doc/library/tokenize.rst +++ b/Doc/library/tokenize.rst @@ -101,14 +101,16 @@ function it uses to do this is available: If no encoding is specified, then the default of ``'utf-8'`` will be returned. - :func:`detect_encoding` is useful for robustly reading Python source files. - A common pattern for this follows:: + Use :func:`open` to open Python source files: it uses + :func:`detect_encoding` to detect the file encoding. - def read_python_source(file_name): - with open(file_name, "rb") as fp: - encoding = tokenize.detect_encoding(fp.readline)[0] - with open(file_name, "r", encoding=encoding) as fp: - return fp.read() + +.. function:: open(filename) + + Open a file in read only mode using the encoding detected by + :func:`detect_encoding`. + + .. versionadded:: 3.2 Example of a script rewriter that transforms float literals into Decimal @@ -153,4 +155,3 @@ objects:: result.append((toknum, tokval)) return untokenize(result).decode('utf-8') - diff --git a/Lib/linecache.py b/Lib/linecache.py index 974b1d9..c3f2c3f 100644 --- a/Lib/linecache.py +++ b/Lib/linecache.py @@ -123,9 +123,7 @@ def updatecache(filename, module_globals=None): else: return [] try: - with open(fullname, 'rb') as fp: - coding, line = tokenize.detect_encoding(fp.readline) - with open(fullname, 'r', encoding=coding) as fp: + with tokenize.open(fullname) as fp: lines = fp.readlines() except IOError: return [] diff --git a/Lib/py_compile.py b/Lib/py_compile.py index 111893e..d241434 100644 --- a/Lib/py_compile.py +++ b/Lib/py_compile.py @@ -104,9 +104,7 @@ def compile(file, cfile=None, dfile=None, doraise=False): byte-compile all installed files (or all files in selected directories). """ - with open(file, "rb") as f: - encoding = tokenize.detect_encoding(f.readline)[0] - with open(file, encoding=encoding) as f: + with tokenize.open(file) as f: try: timestamp = int(os.fstat(f.fileno()).st_mtime) except AttributeError: diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py index 7053fd9..a4d4ef0 100755 --- a/Lib/tabnanny.py +++ b/Lib/tabnanny.py @@ -93,11 +93,8 @@ def check(file): check(fullname) return - with open(file, 'rb') as f: - encoding, lines = tokenize.detect_encoding(f.readline) - try: - f = open(file, encoding=encoding) + f = tokenize.open(file) except IOError as msg: errprint("%r: I/O Error: %s" % (file, msg)) return diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 10e59b9..f98efcb 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -564,7 +564,8 @@ Non-ascii identifiers from test import support from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name, detect_encoding) + STRING, ENDMARKER, tok_name, detect_encoding, + open as tokenize_open) from io import BytesIO from unittest import TestCase import os, sys, glob @@ -857,6 +858,26 @@ class TestDetectEncoding(TestCase): readline = self.get_readline((b'# coding: bad\n',)) self.assertRaises(SyntaxError, detect_encoding, readline) + def test_open(self): + filename = support.TESTFN + '.py' + self.addCleanup(support.unlink, filename) + + # test coding cookie + for encoding in ('iso-8859-15', 'utf-8'): + with open(filename, 'w', encoding=encoding) as fp: + print("# coding: %s" % encoding, file=fp) + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + assert fp.encoding == encoding + assert fp.mode == 'r' + + # test BOM (no coding cookie) + with open(filename, 'w', encoding='utf-8-sig') as fp: + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + assert fp.encoding == 'utf-8-sig' + assert fp.mode == 'r' + class TestTokenize(TestCase): def test_tokenize(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index eb58831..7745412 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -29,6 +29,7 @@ import sys from token import * from codecs import lookup, BOM_UTF8 import collections +from io import TextIOWrapper cookie_re = re.compile("coding[:=]\s*([-\w.]+)") import token @@ -335,6 +336,20 @@ def detect_encoding(readline): return default, [first, second] +_builtin_open = open + +def open(filename): + """Open a file in read only mode using the encoding detected by + detect_encoding(). + """ + buffer = _builtin_open(filename, 'rb') + encoding, lines = detect_encoding(buffer.readline) + buffer.seek(0) + text = TextIOWrapper(buffer, encoding, line_buffering=True) + text.mode = 'r' + return text + + def tokenize(readline): """ The tokenize() generator requires one argment, readline, which diff --git a/Lib/trace.py b/Lib/trace.py index 8ea4b89..b50aa02 100644 --- a/Lib/trace.py +++ b/Lib/trace.py @@ -432,10 +432,9 @@ def find_strings(filename, encoding=None): def find_executable_linenos(filename): """Return dict where keys are line numbers in the line number table.""" try: - with io.FileIO(filename, 'r') as file: - encoding, lines = tokenize.detect_encoding(file.readline) - with open(filename, "r", encoding=encoding) as f: + with tokenize.open(filename) as f: prog = f.read() + encoding = f.encoding except IOError as err: print(("Not printing coverage data for %r: %s" % (filename, err)), file=sys.stderr) diff --git a/Misc/NEWS b/Misc/NEWS index 5586118..48f952e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -60,6 +60,9 @@ Core and Builtins Library ------- +- Issue #10335: Add tokenize.open(), detect the file encoding using + tokenize.detect_encoding() and open it in read only mode. + - Issue #10321: Added support for binary data to smtplib.SMTP.sendmail, and a new method send_message to send an email.message.Message object. -- cgit v0.12