From d481e3d7914d20238c62c76991255b3b2b5e4a17 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Sat, 9 May 2009 19:42:23 +0000 Subject: Merged revisions 72494 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ................ r72494 | benjamin.peterson | 2009-05-08 20:01:14 -0500 (Fri, 08 May 2009) | 21 lines Merged revisions 72491-72493 via svnmerge from svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3 ........ r72491 | benjamin.peterson | 2009-05-08 19:33:27 -0500 (Fri, 08 May 2009) | 7 lines make 2to3 use unicode internally on 2.x This started out as a fix for #2660, but became this large refactoring when I realized the dire state this was in. 2to3 now uses tokenize.detect_encoding to decode the files correctly into unicode. ........ r72492 | benjamin.peterson | 2009-05-08 19:35:38 -0500 (Fri, 08 May 2009) | 1 line remove compat code ........ r72493 | benjamin.peterson | 2009-05-08 19:54:15 -0500 (Fri, 08 May 2009) | 1 line add a test for \r\n newlines ........ ................ 
--- Lib/lib2to3/fixes/fix_imports.py | 2 +- Lib/lib2to3/fixes/fix_methodattrs.py | 2 +- Lib/lib2to3/fixes/fix_renames.py | 2 +- Lib/lib2to3/fixes/fix_types.py | 2 +- Lib/lib2to3/main.py | 6 +-- Lib/lib2to3/patcomp.py | 2 +- Lib/lib2to3/pgen2/driver.py | 5 +- Lib/lib2to3/pgen2/tokenize.py | 70 ++++++++++++++++++++++++++++ Lib/lib2to3/pytree.py | 14 +++++- Lib/lib2to3/refactor.py | 70 +++++++++++++++++++--------- Lib/lib2to3/tests/data/crlf.py | 3 ++ Lib/lib2to3/tests/data/different_encoding.py | 4 ++ Lib/lib2to3/tests/support.py | 13 +----- Lib/lib2to3/tests/test_all_fixers.py | 2 +- Lib/lib2to3/tests/test_parser.py | 22 +++++++-- Lib/lib2to3/tests/test_refactor.py | 40 ++++++++++++---- 16 files changed, 200 insertions(+), 59 deletions(-) create mode 100644 Lib/lib2to3/tests/data/crlf.py create mode 100644 Lib/lib2to3/tests/data/different_encoding.py diff --git a/Lib/lib2to3/fixes/fix_imports.py b/Lib/lib2to3/fixes/fix_imports.py index 46ba4a2..f79ad63 100644 --- a/Lib/lib2to3/fixes/fix_imports.py +++ b/Lib/lib2to3/fixes/fix_imports.py @@ -123,7 +123,7 @@ class FixImports(fixer_base.BaseFix): import_mod = results.get("module_name") if import_mod: mod_name = import_mod.value - new_name = self.mapping[mod_name] + new_name = str(self.mapping[mod_name]) import_mod.replace(Name(new_name, prefix=import_mod.get_prefix())) if "name_import" in results: # If it's not a "from x import x, y" or "import x as y" import, diff --git a/Lib/lib2to3/fixes/fix_methodattrs.py b/Lib/lib2to3/fixes/fix_methodattrs.py index ae4096c..814455e 100644 --- a/Lib/lib2to3/fixes/fix_methodattrs.py +++ b/Lib/lib2to3/fixes/fix_methodattrs.py @@ -19,5 +19,5 @@ class FixMethodattrs(fixer_base.BaseFix): def transform(self, node, results): attr = results["attr"][0] - new = MAP[attr.value] + new = str(MAP[attr.value]) attr.replace(Name(new, prefix=attr.get_prefix())) diff --git a/Lib/lib2to3/fixes/fix_renames.py b/Lib/lib2to3/fixes/fix_renames.py index 3049610..a85813f 100644 --- 
a/Lib/lib2to3/fixes/fix_renames.py +++ b/Lib/lib2to3/fixes/fix_renames.py @@ -65,5 +65,5 @@ class FixRenames(fixer_base.BaseFix): #import_mod = results.get("module") if mod_name and attr_name: - new_attr = LOOKUP[(mod_name.value, attr_name.value)] + new_attr = str(LOOKUP[(mod_name.value, attr_name.value)]) attr_name.replace(Name(new_attr, prefix=attr_name.get_prefix())) diff --git a/Lib/lib2to3/fixes/fix_types.py b/Lib/lib2to3/fixes/fix_types.py index 445f1b2..59fd011 100644 --- a/Lib/lib2to3/fixes/fix_types.py +++ b/Lib/lib2to3/fixes/fix_types.py @@ -56,7 +56,7 @@ class FixTypes(fixer_base.BaseFix): PATTERN = '|'.join(_pats) def transform(self, node, results): - new_value = _TYPE_MAPPING.get(results["name"].value) + new_value = str(_TYPE_MAPPING.get(results["name"].value)) if new_value: return Name(new_value, prefix=node.get_prefix()) return None diff --git a/Lib/lib2to3/main.py b/Lib/lib2to3/main.py index 084fc0c..e1adc88 100644 --- a/Lib/lib2to3/main.py +++ b/Lib/lib2to3/main.py @@ -23,7 +23,7 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool): self.errors.append((msg, args, kwargs)) self.logger.error(msg, *args, **kwargs) - def write_file(self, new_text, filename, old_text): + def write_file(self, new_text, filename, old_text, encoding): if not self.nobackups: # Make backup backup = filename + ".bak" @@ -37,8 +37,8 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool): except os.error as err: self.log_message("Can't rename %s to %s", filename, backup) # Actually write the new file - super(StdoutRefactoringTool, self).write_file(new_text, - filename, old_text) + write = super(StdoutRefactoringTool, self).write_file + write(new_text, filename, old_text, encoding) if not self.nobackups: shutil.copymode(backup, filename) diff --git a/Lib/lib2to3/patcomp.py b/Lib/lib2to3/patcomp.py index 7826f90..076fdc1 100644 --- a/Lib/lib2to3/patcomp.py +++ b/Lib/lib2to3/patcomp.py @@ -133,7 +133,7 @@ class PatternCompiler(object): assert 
len(nodes) >= 1 node = nodes[0] if node.type == token.STRING: - value = literals.evalString(node.value) + value = str(literals.evalString(node.value)) return pytree.LeafPattern(content=value) elif node.type == token.NAME: value = node.value diff --git a/Lib/lib2to3/pgen2/driver.py b/Lib/lib2to3/pgen2/driver.py index a025b37..ee77a13 100644 --- a/Lib/lib2to3/pgen2/driver.py +++ b/Lib/lib2to3/pgen2/driver.py @@ -16,6 +16,7 @@ __author__ = "Guido van Rossum " __all__ = ["Driver", "load_grammar"] # Python imports +import codecs import os import logging import sys @@ -90,9 +91,9 @@ class Driver(object): """Parse a stream and return the syntax tree.""" return self.parse_stream_raw(stream, debug) - def parse_file(self, filename, debug=False): + def parse_file(self, filename, encoding=None, debug=False): """Parse a file and return the syntax tree.""" - stream = open(filename) + stream = codecs.open(filename, "r", encoding) try: return self.parse_stream(stream, debug) finally: diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py index 33cfc33..799566b 100644 --- a/Lib/lib2to3/pgen2/tokenize.py +++ b/Lib/lib2to3/pgen2/tokenize.py @@ -30,6 +30,7 @@ __credits__ = \ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' import string, re +from codecs import BOM_UTF8, lookup from lib2to3.pgen2.token import * from . import token @@ -228,6 +229,75 @@ class Untokenizer: startline = False toks_append(tokval) +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") + +def detect_encoding(readline): + """ + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argument, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read + in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. 
If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. If the encoding cookie is an + invalid charset, raise a SyntaxError. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + bom_found = False + encoding = None + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + line_string = line.decode('ascii') + except UnicodeDecodeError: + return None + + matches = cookie_re.findall(line_string) + if not matches: + return None + encoding = matches[0] + try: + codec = lookup(encoding) + except LookupError: + # This behaviour mimics the Python interpreter + raise SyntaxError("unknown encoding: " + encoding) + + if bom_found and codec.name != 'utf-8': + # This behaviour mimics the Python interpreter + raise SyntaxError('encoding problem: utf-8') + return encoding + + first = read_or_stop() + if first.startswith(BOM_UTF8): + bom_found = True + first = first[3:] + if not first: + return 'utf-8', [] + + encoding = find_cookie(first) + if encoding: + return encoding, [first] + + second = read_or_stop() + if not second: + return 'utf-8', [first] + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second] + + return 'utf-8', [first, second] + def untokenize(iterable): """Transform tokens back into Python source code. diff --git a/Lib/lib2to3/pytree.py b/Lib/lib2to3/pytree.py index 9de810e..c60f107 100644 --- a/Lib/lib2to3/pytree.py +++ b/Lib/lib2to3/pytree.py @@ -216,6 +216,10 @@ class Base(object): return "" return next_sib.get_prefix() + if sys.version_info < (3, 0): + def __str__(self): + return str(self).encode("ascii") + class Node(Base): @@ -245,7 +249,7 @@ class Node(Base): type_repr(self.type), self.children) - def __str__(self): + def __unicode__(self): """ Return a pretty string representation. 
@@ -253,6 +257,9 @@ class Node(Base): """ return "".join(map(str, self.children)) + if sys.version_info > (3, 0): + __str__ = __unicode__ + def _eq(self, other): """Compare two nodes for equality.""" return (self.type, self.children) == (other.type, other.children) @@ -353,7 +360,7 @@ class Leaf(Base): self.type, self.value) - def __str__(self): + def __unicode__(self): """ Return a pretty string representation. @@ -361,6 +368,9 @@ class Leaf(Base): """ return self.prefix + str(self.value) + if sys.version_info > (3, 0): + __str__ = __unicode__ + def _eq(self, other): """Compare two nodes for equality.""" return (self.type, self.value) == (other.type, other.value) diff --git a/Lib/lib2to3/refactor.py b/Lib/lib2to3/refactor.py index b679db4..82a98d1 100755 --- a/Lib/lib2to3/refactor.py +++ b/Lib/lib2to3/refactor.py @@ -22,8 +22,7 @@ from collections import defaultdict from itertools import chain # Local imports -from .pgen2 import driver -from .pgen2 import tokenize +from .pgen2 import driver, tokenize from . import pytree from . import patcomp @@ -87,6 +86,25 @@ def get_fixers_from_package(pkg_name): return [pkg_name + "." + fix_name for fix_name in get_all_fix_names(pkg_name, False)] +def _identity(obj): + return obj + +if sys.version_info < (3, 0): + import codecs + _open_with_encoding = codecs.open + # codecs.open doesn't translate newlines sadly. 
+ def _from_system_newlines(input): + return input.replace("\r\n", "\n") + def _to_system_newlines(input): + if os.linesep != "\n": + return input.replace("\n", os.linesep) + else: + return input +else: + _open_with_encoding = open + _from_system_newlines = _identity + _to_system_newlines = _identity + class FixerError(Exception): """A fixer could not be loaded.""" @@ -213,29 +231,42 @@ class RefactoringTool(object): # Modify dirnames in-place to remove subdirs with leading dots dirnames[:] = [dn for dn in dirnames if not dn.startswith(".")] - def refactor_file(self, filename, write=False, doctests_only=False): - """Refactors a file.""" + def _read_python_source(self, filename): + """ + Do our best to decode a Python source file correctly. + """ try: - f = open(filename) + f = open(filename, "rb") except IOError as err: self.log_error("Can't open %s: %s", filename, err) - return + return None, None try: - input = f.read() + "\n" # Silence certain parse errors + encoding = tokenize.detect_encoding(f.readline)[0] finally: f.close() + with _open_with_encoding(filename, "r", encoding=encoding) as f: + return _from_system_newlines(f.read()), encoding + + def refactor_file(self, filename, write=False, doctests_only=False): + """Refactors a file.""" + input, encoding = self._read_python_source(filename) + if input is None: + # Reading the file failed. 
+ return + input += "\n" # Silence certain parse errors if doctests_only: self.log_debug("Refactoring doctests in %s", filename) output = self.refactor_docstring(input, filename) if output != input: - self.processed_file(output, filename, input, write=write) + self.processed_file(output, filename, input, write, encoding) else: self.log_debug("No doctest changes in %s", filename) else: tree = self.refactor_string(input, filename) if tree and tree.was_changed: # The [:-1] is to take off the \n we added earlier - self.processed_file(str(tree)[:-1], filename, write=write) + self.processed_file(str(tree)[:-1], filename, + write=write, encoding=encoding) else: self.log_debug("No changes in %s", filename) @@ -321,31 +352,26 @@ class RefactoringTool(object): node.replace(new) node = new - def processed_file(self, new_text, filename, old_text=None, write=False): + def processed_file(self, new_text, filename, old_text=None, write=False, + encoding=None): """ Called when a file has been refactored, and there are changes. """ self.files.append(filename) if old_text is None: - try: - f = open(filename, "r") - except IOError as err: - self.log_error("Can't read %s: %s", filename, err) + old_text = self._read_python_source(filename)[0] + if old_text is None: return - try: - old_text = f.read() - finally: - f.close() if old_text == new_text: self.log_debug("No changes to %s", filename) return self.print_output(diff_texts(old_text, new_text, filename)) if write: - self.write_file(new_text, filename, old_text) + self.write_file(new_text, filename, old_text, encoding) else: self.log_debug("Not writing changes to %s", filename) - def write_file(self, new_text, filename, old_text): + def write_file(self, new_text, filename, old_text, encoding=None): """Writes a string to a file. It first shows a unified diff between the old text and the new text, and @@ -353,12 +379,12 @@ class RefactoringTool(object): set. 
""" try: - f = open(filename, "w") + f = _open_with_encoding(filename, "w", encoding=encoding) except os.error as err: self.log_error("Can't create %s: %s", filename, err) return try: - f.write(new_text) + f.write(_to_system_newlines(new_text)) except os.error as err: self.log_error("Can't write %s: %s", filename, err) finally: diff --git a/Lib/lib2to3/tests/data/crlf.py b/Lib/lib2to3/tests/data/crlf.py new file mode 100644 index 0000000..dbe2d7b --- /dev/null +++ b/Lib/lib2to3/tests/data/crlf.py @@ -0,0 +1,3 @@ +print "hi" + +print "Like bad Windows newlines?" diff --git a/Lib/lib2to3/tests/data/different_encoding.py b/Lib/lib2to3/tests/data/different_encoding.py new file mode 100644 index 0000000..4bb82bd --- /dev/null +++ b/Lib/lib2to3/tests/data/different_encoding.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- +print(u'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') + diff --git a/Lib/lib2to3/tests/support.py b/Lib/lib2to3/tests/support.py index 7abf2ef..8b8468c 100644 --- a/Lib/lib2to3/tests/support.py +++ b/Lib/lib2to3/tests/support.py @@ -9,12 +9,9 @@ import os.path import re from textwrap import dedent -#sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - # Local imports -from .. import pytree -from .. 
import refactor -from ..pgen2 import driver +from lib2to3 import pytree, refactor +from lib2to3.pgen2 import driver test_dir = os.path.dirname(__file__) proj_dir = os.path.normpath(os.path.join(test_dir, "..")) @@ -25,12 +22,6 @@ driver = driver.Driver(grammar, convert=pytree.convert) def parse_string(string): return driver.parse_string(reformat(string), debug=True) -# Python 2.3's TestSuite is not iter()-able -if sys.version_info < (2, 4): - def TestSuite_iter(self): - return iter(self._tests) - unittest.TestSuite.__iter__ = TestSuite_iter - def run_all_tests(test_mod=None, tests=None): if tests is None: tests = unittest.TestLoader().loadTestsFromModule(test_mod) diff --git a/Lib/lib2to3/tests/test_all_fixers.py b/Lib/lib2to3/tests/test_all_fixers.py index 68d6306..1795ade 100644 --- a/Lib/lib2to3/tests/test_all_fixers.py +++ b/Lib/lib2to3/tests/test_all_fixers.py @@ -28,7 +28,7 @@ class Test_all(support.TestCase): def test_all_project_files(self): for filepath in support.all_project_files(): print("Fixing %s..." % filepath) - self.refactor.refactor_string(open(filepath).read(), filepath) + self.refactor.refactor_file(filepath) if __name__ == "__main__": diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py index 83aa812..2aa737c 100644 --- a/Lib/lib2to3/tests/test_parser.py +++ b/Lib/lib2to3/tests/test_parser.py @@ -14,9 +14,9 @@ from .support import driver, test_dir # Python imports import os -import os.path # Local imports +from lib2to3.pgen2 import tokenize from ..pgen2.parse import ParseError @@ -150,13 +150,25 @@ class TestParserIdempotency(support.TestCase): def test_all_project_files(self): for filepath in support.all_project_files(): print("Parsing %s..." 
% filepath) - tree = driver.parse_file(filepath, debug=True) - if diff(filepath, tree): + with open(filepath, "rb") as fp: + encoding = tokenize.detect_encoding(fp.readline)[0] + fp.seek(0) + source = fp.read() + if encoding: + source = source.decode(encoding) + tree = driver.parse_string(source) + new = str(tree) + if encoding: + new = new.encode(encoding) + if diff(filepath, new): self.fail("Idempotency failed: %s" % filepath) class TestLiterals(GrammarTest): + def validate(self, s): + driver.parse_string(support.dedent(s) + "\n\n") + def test_multiline_bytes_literals(self): s = """ md5test(b"\xaa" * 80, @@ -185,10 +197,10 @@ class TestLiterals(GrammarTest): self.validate(s) -def diff(fn, tree): +def diff(fn, result): f = open("@", "w") try: - f.write(str(tree)) + f.write(result) finally: f.close() try: diff --git a/Lib/lib2to3/tests/test_refactor.py b/Lib/lib2to3/tests/test_refactor.py index 5a49f01..e55f555 100644 --- a/Lib/lib2to3/tests/test_refactor.py +++ b/Lib/lib2to3/tests/test_refactor.py @@ -14,7 +14,8 @@ from lib2to3 import refactor, pygram, fixer_base from . 
import support -FIXER_DIR = os.path.join(os.path.dirname(__file__), "data/fixers") +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") +FIXER_DIR = os.path.join(TEST_DATA_DIR, "fixers") sys.path.append(FIXER_DIR) try: @@ -22,6 +23,8 @@ try: finally: sys.path.pop() +_2TO3_FIXERS = refactor.get_fixers_from_package("lib2to3.fixes") + class TestRefactoringTool(unittest.TestCase): def setUp(self): @@ -121,19 +124,40 @@ class TestRefactoringTool(unittest.TestCase): +def cheese(): pass""".splitlines() self.assertEqual(diff_lines[:-1], expected) + def check_file_refactoring(self, test_file, fixers=_2TO3_FIXERS): + def read_file(): + with open(test_file, "rb") as fp: + return fp.read() + old_contents = read_file() + rt = self.rt(fixers=fixers) + + rt.refactor_file(test_file) + self.assertEqual(old_contents, read_file()) + + try: + rt.refactor_file(test_file, True) + self.assertNotEqual(old_contents, read_file()) + finally: + with open(test_file, "wb") as fp: + fp.write(old_contents) + def test_refactor_file(self): test_file = os.path.join(FIXER_DIR, "parrot_example.py") - old_contents = open(test_file, "r").read() - rt = self.rt() + self.check_file_refactoring(test_file, _DEFAULT_FIXERS) - rt.refactor_file(test_file) - self.assertEqual(old_contents, open(test_file, "r").read()) + def test_file_encoding(self): + fn = os.path.join(TEST_DATA_DIR, "different_encoding.py") + self.check_file_refactoring(fn) - rt.refactor_file(test_file, True) + def test_crlf_newlines(self): + old_sep = os.linesep + os.linesep = "\r\n" try: - self.assertNotEqual(old_contents, open(test_file, "r").read()) + fn = os.path.join(TEST_DATA_DIR, "crlf.py") + fixes = refactor.get_fixers_from_package("lib2to3.fixes") + self.check_file_refactoring(fn, fixes) finally: - open(test_file, "w").write(old_contents) + os.linesep = old_sep def test_refactor_docstring(self): rt = self.rt() -- cgit v0.12