summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Benjamin Peterson <benjamin@python.org> 2009-05-09 19:42:23 (GMT)
committer: Benjamin Peterson <benjamin@python.org> 2009-05-09 19:42:23 (GMT)
commit: d481e3d7914d20238c62c76991255b3b2b5e4a17 (patch)
tree: fb9a3831c561486f09fde515d41410c3f8753007
parent: b0ba27dff1442fe6dc7b00ce7d8488afb159d9b8 (diff)
download: cpython-d481e3d7914d20238c62c76991255b3b2b5e4a17.zip
cpython-d481e3d7914d20238c62c76991255b3b2b5e4a17.tar.gz
cpython-d481e3d7914d20238c62c76991255b3b2b5e4a17.tar.bz2
Merged revisions 72494 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

................
  r72494 | benjamin.peterson | 2009-05-08 20:01:14 -0500 (Fri, 08 May 2009) | 21 lines

  Merged revisions 72491-72493 via svnmerge from
  svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3

  ........
    r72491 | benjamin.peterson | 2009-05-08 19:33:27 -0500 (Fri, 08 May 2009) | 7 lines

    make 2to3 use unicode internally on 2.x

    This started out as a fix for #2660, but became this large refactoring
    when I realized the dire state this was in. 2to3 now uses
    tokenize.detect_encoding to decode the files correctly into unicode.
  ........
    r72492 | benjamin.peterson | 2009-05-08 19:35:38 -0500 (Fri, 08 May 2009) | 1 line

    remove compat code
  ........
    r72493 | benjamin.peterson | 2009-05-08 19:54:15 -0500 (Fri, 08 May 2009) | 1 line

    add a test for \r\n newlines
  ........
................
-rw-r--r-- Lib/lib2to3/fixes/fix_imports.py | 2
-rw-r--r-- Lib/lib2to3/fixes/fix_methodattrs.py | 2
-rw-r--r-- Lib/lib2to3/fixes/fix_renames.py | 2
-rw-r--r-- Lib/lib2to3/fixes/fix_types.py | 2
-rw-r--r-- Lib/lib2to3/main.py | 6
-rw-r--r-- Lib/lib2to3/patcomp.py | 2
-rw-r--r-- Lib/lib2to3/pgen2/driver.py | 5
-rw-r--r-- Lib/lib2to3/pgen2/tokenize.py | 70
-rw-r--r-- Lib/lib2to3/pytree.py | 14
-rwxr-xr-x Lib/lib2to3/refactor.py | 70
-rw-r--r-- Lib/lib2to3/tests/data/crlf.py | 3
-rw-r--r-- Lib/lib2to3/tests/data/different_encoding.py | 4
-rw-r--r-- Lib/lib2to3/tests/support.py | 13
-rw-r--r-- Lib/lib2to3/tests/test_all_fixers.py | 2
-rw-r--r-- Lib/lib2to3/tests/test_parser.py | 22
-rw-r--r-- Lib/lib2to3/tests/test_refactor.py | 40
16 files changed, 200 insertions, 59 deletions
diff --git a/Lib/lib2to3/fixes/fix_imports.py b/Lib/lib2to3/fixes/fix_imports.py
index 46ba4a2..f79ad63 100644
--- a/Lib/lib2to3/fixes/fix_imports.py
+++ b/Lib/lib2to3/fixes/fix_imports.py
@@ -123,7 +123,7 @@ class FixImports(fixer_base.BaseFix):
import_mod = results.get("module_name")
if import_mod:
mod_name = import_mod.value
- new_name = self.mapping[mod_name]
+ new_name = str(self.mapping[mod_name])
import_mod.replace(Name(new_name, prefix=import_mod.get_prefix()))
if "name_import" in results:
# If it's not a "from x import x, y" or "import x as y" import,
diff --git a/Lib/lib2to3/fixes/fix_methodattrs.py b/Lib/lib2to3/fixes/fix_methodattrs.py
index ae4096c..814455e 100644
--- a/Lib/lib2to3/fixes/fix_methodattrs.py
+++ b/Lib/lib2to3/fixes/fix_methodattrs.py
@@ -19,5 +19,5 @@ class FixMethodattrs(fixer_base.BaseFix):
def transform(self, node, results):
attr = results["attr"][0]
- new = MAP[attr.value]
+ new = str(MAP[attr.value])
attr.replace(Name(new, prefix=attr.get_prefix()))
diff --git a/Lib/lib2to3/fixes/fix_renames.py b/Lib/lib2to3/fixes/fix_renames.py
index 3049610..a85813f 100644
--- a/Lib/lib2to3/fixes/fix_renames.py
+++ b/Lib/lib2to3/fixes/fix_renames.py
@@ -65,5 +65,5 @@ class FixRenames(fixer_base.BaseFix):
#import_mod = results.get("module")
if mod_name and attr_name:
- new_attr = LOOKUP[(mod_name.value, attr_name.value)]
+ new_attr = str(LOOKUP[(mod_name.value, attr_name.value)])
attr_name.replace(Name(new_attr, prefix=attr_name.get_prefix()))
diff --git a/Lib/lib2to3/fixes/fix_types.py b/Lib/lib2to3/fixes/fix_types.py
index 445f1b2..59fd011 100644
--- a/Lib/lib2to3/fixes/fix_types.py
+++ b/Lib/lib2to3/fixes/fix_types.py
@@ -56,7 +56,7 @@ class FixTypes(fixer_base.BaseFix):
PATTERN = '|'.join(_pats)
def transform(self, node, results):
- new_value = _TYPE_MAPPING.get(results["name"].value)
+ new_value = str(_TYPE_MAPPING.get(results["name"].value))
if new_value:
return Name(new_value, prefix=node.get_prefix())
return None
diff --git a/Lib/lib2to3/main.py b/Lib/lib2to3/main.py
index 084fc0c..e1adc88 100644
--- a/Lib/lib2to3/main.py
+++ b/Lib/lib2to3/main.py
@@ -23,7 +23,7 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool):
self.errors.append((msg, args, kwargs))
self.logger.error(msg, *args, **kwargs)
- def write_file(self, new_text, filename, old_text):
+ def write_file(self, new_text, filename, old_text, encoding):
if not self.nobackups:
# Make backup
backup = filename + ".bak"
@@ -37,8 +37,8 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool):
except os.error as err:
self.log_message("Can't rename %s to %s", filename, backup)
# Actually write the new file
- super(StdoutRefactoringTool, self).write_file(new_text,
- filename, old_text)
+ write = super(StdoutRefactoringTool, self).write_file
+ write(new_text, filename, old_text, encoding)
if not self.nobackups:
shutil.copymode(backup, filename)
diff --git a/Lib/lib2to3/patcomp.py b/Lib/lib2to3/patcomp.py
index 7826f90..076fdc1 100644
--- a/Lib/lib2to3/patcomp.py
+++ b/Lib/lib2to3/patcomp.py
@@ -133,7 +133,7 @@ class PatternCompiler(object):
assert len(nodes) >= 1
node = nodes[0]
if node.type == token.STRING:
- value = literals.evalString(node.value)
+ value = str(literals.evalString(node.value))
return pytree.LeafPattern(content=value)
elif node.type == token.NAME:
value = node.value
diff --git a/Lib/lib2to3/pgen2/driver.py b/Lib/lib2to3/pgen2/driver.py
index a025b37..ee77a13 100644
--- a/Lib/lib2to3/pgen2/driver.py
+++ b/Lib/lib2to3/pgen2/driver.py
@@ -16,6 +16,7 @@ __author__ = "Guido van Rossum <guido@python.org>"
__all__ = ["Driver", "load_grammar"]
# Python imports
+import codecs
import os
import logging
import sys
@@ -90,9 +91,9 @@ class Driver(object):
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
- def parse_file(self, filename, debug=False):
+ def parse_file(self, filename, encoding=None, debug=False):
"""Parse a file and return the syntax tree."""
- stream = open(filename)
+ stream = codecs.open(filename, "r", encoding)
try:
return self.parse_stream(stream, debug)
finally:
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 33cfc33..799566b 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -30,6 +30,7 @@ __credits__ = \
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
import string, re
+from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *
from . import token
@@ -228,6 +229,75 @@ class Untokenizer:
startline = False
toks_append(tokval)
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+def detect_encoding(readline):
+ """
+ The detect_encoding() function is used to detect the encoding that should
+ be used to decode a Python source file. It requires one argument, readline,
+ in the same way as the tokenize() generator.
+
+ It will call readline a maximum of twice, and return the encoding used
+ (as a string) and a list of any lines (left as bytes) it has read
+ in.
+
+ It detects the encoding from the presence of a utf-8 bom or an encoding
+ cookie as specified in pep-0263. If both a bom and a cookie are present,
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError.
+
+ If no encoding is specified, then the default of 'utf-8' will be returned.
+ """
+ bom_found = False
+ encoding = None
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return b''
+
+ def find_cookie(line):
+ try:
+ line_string = line.decode('ascii')
+ except UnicodeDecodeError:
+ return None
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = matches[0]
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found and codec.name != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ if not first:
+ return 'utf-8', []
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding, [first]
+
+ second = read_or_stop()
+ if not second:
+ return 'utf-8', [first]
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding, [first, second]
+
+ return 'utf-8', [first, second]
+
def untokenize(iterable):
"""Transform tokens back into Python source code.
diff --git a/Lib/lib2to3/pytree.py b/Lib/lib2to3/pytree.py
index 9de810e..c60f107 100644
--- a/Lib/lib2to3/pytree.py
+++ b/Lib/lib2to3/pytree.py
@@ -216,6 +216,10 @@ class Base(object):
return ""
return next_sib.get_prefix()
+ if sys.version_info < (3, 0):
+ def __str__(self):
+ return str(self).encode("ascii")
+
class Node(Base):
@@ -245,7 +249,7 @@ class Node(Base):
type_repr(self.type),
self.children)
- def __str__(self):
+ def __unicode__(self):
"""
Return a pretty string representation.
@@ -253,6 +257,9 @@ class Node(Base):
"""
return "".join(map(str, self.children))
+ if sys.version_info > (3, 0):
+ __str__ = __unicode__
+
def _eq(self, other):
"""Compare two nodes for equality."""
return (self.type, self.children) == (other.type, other.children)
@@ -353,7 +360,7 @@ class Leaf(Base):
self.type,
self.value)
- def __str__(self):
+ def __unicode__(self):
"""
Return a pretty string representation.
@@ -361,6 +368,9 @@ class Leaf(Base):
"""
return self.prefix + str(self.value)
+ if sys.version_info > (3, 0):
+ __str__ = __unicode__
+
def _eq(self, other):
"""Compare two nodes for equality."""
return (self.type, self.value) == (other.type, other.value)
diff --git a/Lib/lib2to3/refactor.py b/Lib/lib2to3/refactor.py
index b679db4..82a98d1 100755
--- a/Lib/lib2to3/refactor.py
+++ b/Lib/lib2to3/refactor.py
@@ -22,8 +22,7 @@ from collections import defaultdict
from itertools import chain
# Local imports
-from .pgen2 import driver
-from .pgen2 import tokenize
+from .pgen2 import driver, tokenize
from . import pytree
from . import patcomp
@@ -87,6 +86,25 @@ def get_fixers_from_package(pkg_name):
return [pkg_name + "." + fix_name
for fix_name in get_all_fix_names(pkg_name, False)]
+def _identity(obj):
+ return obj
+
+if sys.version_info < (3, 0):
+ import codecs
+ _open_with_encoding = codecs.open
+ # codecs.open doesn't translate newlines sadly.
+ def _from_system_newlines(input):
+ return input.replace("\r\n", "\n")
+ def _to_system_newlines(input):
+ if os.linesep != "\n":
+ return input.replace("\n", os.linesep)
+ else:
+ return input
+else:
+ _open_with_encoding = open
+ _from_system_newlines = _identity
+ _to_system_newlines = _identity
+
class FixerError(Exception):
"""A fixer could not be loaded."""
@@ -213,29 +231,42 @@ class RefactoringTool(object):
# Modify dirnames in-place to remove subdirs with leading dots
dirnames[:] = [dn for dn in dirnames if not dn.startswith(".")]
- def refactor_file(self, filename, write=False, doctests_only=False):
- """Refactors a file."""
+ def _read_python_source(self, filename):
+ """
+ Do our best to decode a Python source file correctly.
+ """
try:
- f = open(filename)
+ f = open(filename, "rb")
except IOError as err:
self.log_error("Can't open %s: %s", filename, err)
- return
+ return None, None
try:
- input = f.read() + "\n" # Silence certain parse errors
+ encoding = tokenize.detect_encoding(f.readline)[0]
finally:
f.close()
+ with _open_with_encoding(filename, "r", encoding=encoding) as f:
+ return _from_system_newlines(f.read()), encoding
+
+ def refactor_file(self, filename, write=False, doctests_only=False):
+ """Refactors a file."""
+ input, encoding = self._read_python_source(filename)
+ if input is None:
+ # Reading the file failed.
+ return
+ input += "\n" # Silence certain parse errors
if doctests_only:
self.log_debug("Refactoring doctests in %s", filename)
output = self.refactor_docstring(input, filename)
if output != input:
- self.processed_file(output, filename, input, write=write)
+ self.processed_file(output, filename, input, write, encoding)
else:
self.log_debug("No doctest changes in %s", filename)
else:
tree = self.refactor_string(input, filename)
if tree and tree.was_changed:
# The [:-1] is to take off the \n we added earlier
- self.processed_file(str(tree)[:-1], filename, write=write)
+ self.processed_file(str(tree)[:-1], filename,
+ write=write, encoding=encoding)
else:
self.log_debug("No changes in %s", filename)
@@ -321,31 +352,26 @@ class RefactoringTool(object):
node.replace(new)
node = new
- def processed_file(self, new_text, filename, old_text=None, write=False):
+ def processed_file(self, new_text, filename, old_text=None, write=False,
+ encoding=None):
"""
Called when a file has been refactored, and there are changes.
"""
self.files.append(filename)
if old_text is None:
- try:
- f = open(filename, "r")
- except IOError as err:
- self.log_error("Can't read %s: %s", filename, err)
+ old_text = self._read_python_source(filename)[0]
+ if old_text is None:
return
- try:
- old_text = f.read()
- finally:
- f.close()
if old_text == new_text:
self.log_debug("No changes to %s", filename)
return
self.print_output(diff_texts(old_text, new_text, filename))
if write:
- self.write_file(new_text, filename, old_text)
+ self.write_file(new_text, filename, old_text, encoding)
else:
self.log_debug("Not writing changes to %s", filename)
- def write_file(self, new_text, filename, old_text):
+ def write_file(self, new_text, filename, old_text, encoding=None):
"""Writes a string to a file.
It first shows a unified diff between the old text and the new text, and
@@ -353,12 +379,12 @@ class RefactoringTool(object):
set.
"""
try:
- f = open(filename, "w")
+ f = _open_with_encoding(filename, "w", encoding=encoding)
except os.error as err:
self.log_error("Can't create %s: %s", filename, err)
return
try:
- f.write(new_text)
+ f.write(_to_system_newlines(new_text))
except os.error as err:
self.log_error("Can't write %s: %s", filename, err)
finally:
diff --git a/Lib/lib2to3/tests/data/crlf.py b/Lib/lib2to3/tests/data/crlf.py
new file mode 100644
index 0000000..dbe2d7b
--- /dev/null
+++ b/Lib/lib2to3/tests/data/crlf.py
@@ -0,0 +1,3 @@
+print "hi"
+
+print "Like bad Windows newlines?"
diff --git a/Lib/lib2to3/tests/data/different_encoding.py b/Lib/lib2to3/tests/data/different_encoding.py
new file mode 100644
index 0000000..4bb82bd
--- /dev/null
+++ b/Lib/lib2to3/tests/data/different_encoding.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: iso-8859-1 -*-
+print(u'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
+
diff --git a/Lib/lib2to3/tests/support.py b/Lib/lib2to3/tests/support.py
index 7abf2ef..8b8468c 100644
--- a/Lib/lib2to3/tests/support.py
+++ b/Lib/lib2to3/tests/support.py
@@ -9,12 +9,9 @@ import os.path
import re
from textwrap import dedent
-#sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
# Local imports
-from .. import pytree
-from .. import refactor
-from ..pgen2 import driver
+from lib2to3 import pytree, refactor
+from lib2to3.pgen2 import driver
test_dir = os.path.dirname(__file__)
proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
@@ -25,12 +22,6 @@ driver = driver.Driver(grammar, convert=pytree.convert)
def parse_string(string):
return driver.parse_string(reformat(string), debug=True)
-# Python 2.3's TestSuite is not iter()-able
-if sys.version_info < (2, 4):
- def TestSuite_iter(self):
- return iter(self._tests)
- unittest.TestSuite.__iter__ = TestSuite_iter
-
def run_all_tests(test_mod=None, tests=None):
if tests is None:
tests = unittest.TestLoader().loadTestsFromModule(test_mod)
diff --git a/Lib/lib2to3/tests/test_all_fixers.py b/Lib/lib2to3/tests/test_all_fixers.py
index 68d6306..1795ade 100644
--- a/Lib/lib2to3/tests/test_all_fixers.py
+++ b/Lib/lib2to3/tests/test_all_fixers.py
@@ -28,7 +28,7 @@ class Test_all(support.TestCase):
def test_all_project_files(self):
for filepath in support.all_project_files():
print("Fixing %s..." % filepath)
- self.refactor.refactor_string(open(filepath).read(), filepath)
+ self.refactor.refactor_file(filepath)
if __name__ == "__main__":
diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py
index 83aa812..2aa737c 100644
--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -14,9 +14,9 @@ from .support import driver, test_dir
# Python imports
import os
-import os.path
# Local imports
+from lib2to3.pgen2 import tokenize
from ..pgen2.parse import ParseError
@@ -150,13 +150,25 @@ class TestParserIdempotency(support.TestCase):
def test_all_project_files(self):
for filepath in support.all_project_files():
print("Parsing %s..." % filepath)
- tree = driver.parse_file(filepath, debug=True)
- if diff(filepath, tree):
+ with open(filepath, "rb") as fp:
+ encoding = tokenize.detect_encoding(fp.readline)[0]
+ fp.seek(0)
+ source = fp.read()
+ if encoding:
+ source = source.decode(encoding)
+ tree = driver.parse_string(source)
+ new = str(tree)
+ if encoding:
+ new = new.encode(encoding)
+ if diff(filepath, new):
self.fail("Idempotency failed: %s" % filepath)
class TestLiterals(GrammarTest):
+ def validate(self, s):
+ driver.parse_string(support.dedent(s) + "\n\n")
+
def test_multiline_bytes_literals(self):
s = """
md5test(b"\xaa" * 80,
@@ -185,10 +197,10 @@ class TestLiterals(GrammarTest):
self.validate(s)
-def diff(fn, tree):
+def diff(fn, result):
f = open("@", "w")
try:
- f.write(str(tree))
+ f.write(result)
finally:
f.close()
try:
diff --git a/Lib/lib2to3/tests/test_refactor.py b/Lib/lib2to3/tests/test_refactor.py
index 5a49f01..e55f555 100644
--- a/Lib/lib2to3/tests/test_refactor.py
+++ b/Lib/lib2to3/tests/test_refactor.py
@@ -14,7 +14,8 @@ from lib2to3 import refactor, pygram, fixer_base
from . import support
-FIXER_DIR = os.path.join(os.path.dirname(__file__), "data/fixers")
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+FIXER_DIR = os.path.join(TEST_DATA_DIR, "fixers")
sys.path.append(FIXER_DIR)
try:
@@ -22,6 +23,8 @@ try:
finally:
sys.path.pop()
+_2TO3_FIXERS = refactor.get_fixers_from_package("lib2to3.fixes")
+
class TestRefactoringTool(unittest.TestCase):
def setUp(self):
@@ -121,19 +124,40 @@ class TestRefactoringTool(unittest.TestCase):
+def cheese(): pass""".splitlines()
self.assertEqual(diff_lines[:-1], expected)
+ def check_file_refactoring(self, test_file, fixers=_2TO3_FIXERS):
+ def read_file():
+ with open(test_file, "rb") as fp:
+ return fp.read()
+ old_contents = read_file()
+ rt = self.rt(fixers=fixers)
+
+ rt.refactor_file(test_file)
+ self.assertEqual(old_contents, read_file())
+
+ try:
+ rt.refactor_file(test_file, True)
+ self.assertNotEqual(old_contents, read_file())
+ finally:
+ with open(test_file, "wb") as fp:
+ fp.write(old_contents)
+
def test_refactor_file(self):
test_file = os.path.join(FIXER_DIR, "parrot_example.py")
- old_contents = open(test_file, "r").read()
- rt = self.rt()
+ self.check_file_refactoring(test_file, _DEFAULT_FIXERS)
- rt.refactor_file(test_file)
- self.assertEqual(old_contents, open(test_file, "r").read())
+ def test_file_encoding(self):
+ fn = os.path.join(TEST_DATA_DIR, "different_encoding.py")
+ self.check_file_refactoring(fn)
- rt.refactor_file(test_file, True)
+ def test_crlf_newlines(self):
+ old_sep = os.linesep
+ os.linesep = "\r\n"
try:
- self.assertNotEqual(old_contents, open(test_file, "r").read())
+ fn = os.path.join(TEST_DATA_DIR, "crlf.py")
+ fixes = refactor.get_fixers_from_package("lib2to3.fixes")
+ self.check_file_refactoring(fn, fixes)
finally:
- open(test_file, "w").write(old_contents)
+ os.linesep = old_sep
def test_refactor_docstring(self):
rt = self.rt()