From d481e3d7914d20238c62c76991255b3b2b5e4a17 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson <benjamin@python.org>
Date: Sat, 9 May 2009 19:42:23 +0000
Subject: Merged revisions 72494 via svnmerge from
 svn+ssh://pythondev@svn.python.org/python/trunk

................
  r72494 | benjamin.peterson | 2009-05-08 20:01:14 -0500 (Fri, 08 May 2009) | 21 lines

  Merged revisions 72491-72493 via svnmerge from
  svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3

  ........
    r72491 | benjamin.peterson | 2009-05-08 19:33:27 -0500 (Fri, 08 May 2009) | 7 lines

    make 2to3 use unicode internally on 2.x

    This started out as a fix for #2660, but became this large refactoring
    when I realized the dire state this was in. 2to3 now uses
    tokenize.detect_encoding to decode the files correctly into unicode.
  ........
    r72492 | benjamin.peterson | 2009-05-08 19:35:38 -0500 (Fri, 08 May 2009) | 1 line

    remove compat code
  ........
    r72493 | benjamin.peterson | 2009-05-08 19:54:15 -0500 (Fri, 08 May 2009) | 1 line

    add a test for \r\n newlines
  ........
................
---
 Lib/lib2to3/fixes/fix_imports.py             |  2 +-
 Lib/lib2to3/fixes/fix_methodattrs.py         |  2 +-
 Lib/lib2to3/fixes/fix_renames.py             |  2 +-
 Lib/lib2to3/fixes/fix_types.py               |  2 +-
 Lib/lib2to3/main.py                          |  6 +--
 Lib/lib2to3/patcomp.py                       |  2 +-
 Lib/lib2to3/pgen2/driver.py                  |  5 +-
 Lib/lib2to3/pgen2/tokenize.py                | 70 ++++++++++++++++++++++++++++
 Lib/lib2to3/pytree.py                        | 14 +++++-
 Lib/lib2to3/refactor.py                      | 70 +++++++++++++++++++---------
 Lib/lib2to3/tests/data/crlf.py               |  3 ++
 Lib/lib2to3/tests/data/different_encoding.py |  4 ++
 Lib/lib2to3/tests/support.py                 | 13 +-----
 Lib/lib2to3/tests/test_all_fixers.py         |  2 +-
 Lib/lib2to3/tests/test_parser.py             | 22 +++++++--
 Lib/lib2to3/tests/test_refactor.py           | 40 ++++++++++++----
 16 files changed, 200 insertions(+), 59 deletions(-)
 create mode 100644 Lib/lib2to3/tests/data/crlf.py
 create mode 100644 Lib/lib2to3/tests/data/different_encoding.py

diff --git a/Lib/lib2to3/fixes/fix_imports.py b/Lib/lib2to3/fixes/fix_imports.py
index 46ba4a2..f79ad63 100644
--- a/Lib/lib2to3/fixes/fix_imports.py
+++ b/Lib/lib2to3/fixes/fix_imports.py
@@ -123,7 +123,7 @@ class FixImports(fixer_base.BaseFix):
         import_mod = results.get("module_name")
         if import_mod:
             mod_name = import_mod.value
-            new_name = self.mapping[mod_name]
+            new_name = str(self.mapping[mod_name])
             import_mod.replace(Name(new_name, prefix=import_mod.get_prefix()))
             if "name_import" in results:
                 # If it's not a "from x import x, y" or "import x as y" import,
diff --git a/Lib/lib2to3/fixes/fix_methodattrs.py b/Lib/lib2to3/fixes/fix_methodattrs.py
index ae4096c..814455e 100644
--- a/Lib/lib2to3/fixes/fix_methodattrs.py
+++ b/Lib/lib2to3/fixes/fix_methodattrs.py
@@ -19,5 +19,5 @@ class FixMethodattrs(fixer_base.BaseFix):
 
     def transform(self, node, results):
         attr = results["attr"][0]
-        new = MAP[attr.value]
+        new = str(MAP[attr.value])
         attr.replace(Name(new, prefix=attr.get_prefix()))
diff --git a/Lib/lib2to3/fixes/fix_renames.py b/Lib/lib2to3/fixes/fix_renames.py
index 3049610..a85813f 100644
--- a/Lib/lib2to3/fixes/fix_renames.py
+++ b/Lib/lib2to3/fixes/fix_renames.py
@@ -65,5 +65,5 @@ class FixRenames(fixer_base.BaseFix):
         #import_mod = results.get("module")
 
         if mod_name and attr_name:
-            new_attr = LOOKUP[(mod_name.value, attr_name.value)]
+            new_attr = str(LOOKUP[(mod_name.value, attr_name.value)])
             attr_name.replace(Name(new_attr, prefix=attr_name.get_prefix()))
diff --git a/Lib/lib2to3/fixes/fix_types.py b/Lib/lib2to3/fixes/fix_types.py
index 445f1b2..59fd011 100644
--- a/Lib/lib2to3/fixes/fix_types.py
+++ b/Lib/lib2to3/fixes/fix_types.py
@@ -56,7 +56,7 @@ class FixTypes(fixer_base.BaseFix):
     PATTERN = '|'.join(_pats)
 
     def transform(self, node, results):
-        new_value = _TYPE_MAPPING.get(results["name"].value)
+        new_value = str(_TYPE_MAPPING.get(results["name"].value))
         if new_value:
             return Name(new_value, prefix=node.get_prefix())
         return None
diff --git a/Lib/lib2to3/main.py b/Lib/lib2to3/main.py
index 084fc0c..e1adc88 100644
--- a/Lib/lib2to3/main.py
+++ b/Lib/lib2to3/main.py
@@ -23,7 +23,7 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool):
         self.errors.append((msg, args, kwargs))
         self.logger.error(msg, *args, **kwargs)
 
-    def write_file(self, new_text, filename, old_text):
+    def write_file(self, new_text, filename, old_text, encoding):
         if not self.nobackups:
             # Make backup
             backup = filename + ".bak"
@@ -37,8 +37,8 @@ class StdoutRefactoringTool(refactor.MultiprocessRefactoringTool):
             except os.error as err:
                 self.log_message("Can't rename %s to %s", filename, backup)
         # Actually write the new file
-        super(StdoutRefactoringTool, self).write_file(new_text,
-                                                      filename, old_text)
+        write = super(StdoutRefactoringTool, self).write_file
+        write(new_text, filename, old_text, encoding)
         if not self.nobackups:
             shutil.copymode(backup, filename)
 
diff --git a/Lib/lib2to3/patcomp.py b/Lib/lib2to3/patcomp.py
index 7826f90..076fdc1 100644
--- a/Lib/lib2to3/patcomp.py
+++ b/Lib/lib2to3/patcomp.py
@@ -133,7 +133,7 @@ class PatternCompiler(object):
         assert len(nodes) >= 1
         node = nodes[0]
         if node.type == token.STRING:
-            value = literals.evalString(node.value)
+            value = str(literals.evalString(node.value))
             return pytree.LeafPattern(content=value)
         elif node.type == token.NAME:
             value = node.value
diff --git a/Lib/lib2to3/pgen2/driver.py b/Lib/lib2to3/pgen2/driver.py
index a025b37..ee77a13 100644
--- a/Lib/lib2to3/pgen2/driver.py
+++ b/Lib/lib2to3/pgen2/driver.py
@@ -16,6 +16,7 @@ __author__ = "Guido van Rossum <guido@python.org>"
 __all__ = ["Driver", "load_grammar"]
 
 # Python imports
+import codecs
 import os
 import logging
 import sys
@@ -90,9 +91,9 @@ class Driver(object):
         """Parse a stream and return the syntax tree."""
         return self.parse_stream_raw(stream, debug)
 
-    def parse_file(self, filename, debug=False):
+    def parse_file(self, filename, encoding=None, debug=False):
         """Parse a file and return the syntax tree."""
-        stream = open(filename)
+        stream = codecs.open(filename, "r", encoding)
         try:
             return self.parse_stream(stream, debug)
         finally:
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 33cfc33..799566b 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -30,6 +30,7 @@ __credits__ = \
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
 
 import string, re
+from codecs import BOM_UTF8, lookup
 from lib2to3.pgen2.token import *
 
 from . import token
@@ -228,6 +229,75 @@ class Untokenizer:
                 startline = False
             toks_append(tokval)
 
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+def detect_encoding(readline):
+    """
+    The detect_encoding() function is used to detect the encoding that should
+    be used to decode a Python source file. It requires one argument, readline,
+    in the same way as the tokenize() generator.
+
+    It will call readline a maximum of twice, and return the encoding used
+    (as a string) and a list of any lines (left as bytes) it has read
+    in.
+
+    It detects the encoding from the presence of a utf-8 bom or an encoding
+    cookie as specified in pep-0263. If both a bom and a cookie are present,
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
+
+    If no encoding is specified, then the default of 'utf-8' will be returned.
+    """
+    bom_found = False
+    encoding = None
+    def read_or_stop():
+        try:
+            return readline()
+        except StopIteration:
+            return b''
+
+    def find_cookie(line):
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+    if not first:
+        return 'utf-8', []
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding, [first]
+
+    second = read_or_stop()
+    if not second:
+        return 'utf-8', [first]
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding, [first, second]
+
+    return 'utf-8', [first, second]
+
 def untokenize(iterable):
     """Transform tokens back into Python source code.
 
diff --git a/Lib/lib2to3/pytree.py b/Lib/lib2to3/pytree.py
index 9de810e..c60f107 100644
--- a/Lib/lib2to3/pytree.py
+++ b/Lib/lib2to3/pytree.py
@@ -216,6 +216,10 @@ class Base(object):
             return ""
         return next_sib.get_prefix()
 
+    if sys.version_info < (3, 0):
+        def __str__(self):
+            return str(self).encode("ascii")
+
 
 class Node(Base):
 
@@ -245,7 +249,7 @@ class Node(Base):
                                type_repr(self.type),
                                self.children)
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Return a pretty string representation.
 
@@ -253,6 +257,9 @@ class Node(Base):
         """
         return "".join(map(str, self.children))
 
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
     def _eq(self, other):
         """Compare two nodes for equality."""
         return (self.type, self.children) == (other.type, other.children)
@@ -353,7 +360,7 @@ class Leaf(Base):
                                self.type,
                                self.value)
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Return a pretty string representation.
 
@@ -361,6 +368,9 @@ class Leaf(Base):
         """
         return self.prefix + str(self.value)
 
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
     def _eq(self, other):
         """Compare two nodes for equality."""
         return (self.type, self.value) == (other.type, other.value)
diff --git a/Lib/lib2to3/refactor.py b/Lib/lib2to3/refactor.py
index b679db4..82a98d1 100755
--- a/Lib/lib2to3/refactor.py
+++ b/Lib/lib2to3/refactor.py
@@ -22,8 +22,7 @@ from collections import defaultdict
 from itertools import chain
 
 # Local imports
-from .pgen2 import driver
-from .pgen2 import tokenize
+from .pgen2 import driver, tokenize
 
 from . import pytree
 from . import patcomp
@@ -87,6 +86,25 @@ def get_fixers_from_package(pkg_name):
     return [pkg_name + "." + fix_name
             for fix_name in get_all_fix_names(pkg_name, False)]
 
+def _identity(obj):
+    return obj
+
+if sys.version_info < (3, 0):
+    import codecs
+    _open_with_encoding = codecs.open
+    # codecs.open doesn't translate newlines sadly.
+    def _from_system_newlines(input):
+        return input.replace("\r\n", "\n")
+    def _to_system_newlines(input):
+        if os.linesep != "\n":
+            return input.replace("\n", os.linesep)
+        else:
+            return input
+else:
+    _open_with_encoding = open
+    _from_system_newlines = _identity
+    _to_system_newlines = _identity
+
 
 class FixerError(Exception):
     """A fixer could not be loaded."""
@@ -213,29 +231,42 @@ class RefactoringTool(object):
             # Modify dirnames in-place to remove subdirs with leading dots
             dirnames[:] = [dn for dn in dirnames if not dn.startswith(".")]
 
-    def refactor_file(self, filename, write=False, doctests_only=False):
-        """Refactors a file."""
+    def _read_python_source(self, filename):
+        """
+        Do our best to decode a Python source file correctly.
+        """
         try:
-            f = open(filename)
+            f = open(filename, "rb")
         except IOError as err:
             self.log_error("Can't open %s: %s", filename, err)
-            return
+            return None, None
         try:
-            input = f.read() + "\n" # Silence certain parse errors
+            encoding = tokenize.detect_encoding(f.readline)[0]
         finally:
             f.close()
+        with _open_with_encoding(filename, "r", encoding=encoding) as f:
+            return _from_system_newlines(f.read()), encoding
+
+    def refactor_file(self, filename, write=False, doctests_only=False):
+        """Refactors a file."""
+        input, encoding = self._read_python_source(filename)
+        if input is None:
+            # Reading the file failed.
+            return
+        input += "\n" # Silence certain parse errors
         if doctests_only:
             self.log_debug("Refactoring doctests in %s", filename)
             output = self.refactor_docstring(input, filename)
             if output != input:
-                self.processed_file(output, filename, input, write=write)
+                self.processed_file(output, filename, input, write, encoding)
             else:
                 self.log_debug("No doctest changes in %s", filename)
         else:
             tree = self.refactor_string(input, filename)
             if tree and tree.was_changed:
                 # The [:-1] is to take off the \n we added earlier
-                self.processed_file(str(tree)[:-1], filename, write=write)
+                self.processed_file(str(tree)[:-1], filename,
+                                    write=write, encoding=encoding)
             else:
                 self.log_debug("No changes in %s", filename)
 
@@ -321,31 +352,26 @@ class RefactoringTool(object):
                         node.replace(new)
                         node = new
 
-    def processed_file(self, new_text, filename, old_text=None, write=False):
+    def processed_file(self, new_text, filename, old_text=None, write=False,
+                       encoding=None):
         """
         Called when a file has been refactored, and there are changes.
         """
         self.files.append(filename)
         if old_text is None:
-            try:
-                f = open(filename, "r")
-            except IOError as err:
-                self.log_error("Can't read %s: %s", filename, err)
+            old_text = self._read_python_source(filename)[0]
+            if old_text is None:
                 return
-            try:
-                old_text = f.read()
-            finally:
-                f.close()
         if old_text == new_text:
             self.log_debug("No changes to %s", filename)
             return
         self.print_output(diff_texts(old_text, new_text, filename))
         if write:
-            self.write_file(new_text, filename, old_text)
+            self.write_file(new_text, filename, old_text, encoding)
         else:
             self.log_debug("Not writing changes to %s", filename)
 
-    def write_file(self, new_text, filename, old_text):
+    def write_file(self, new_text, filename, old_text, encoding=None):
         """Writes a string to a file.
 
         It first shows a unified diff between the old text and the new text, and
@@ -353,12 +379,12 @@ class RefactoringTool(object):
         set.
         """
         try:
-            f = open(filename, "w")
+            f = _open_with_encoding(filename, "w", encoding=encoding)
         except os.error as err:
             self.log_error("Can't create %s: %s", filename, err)
             return
         try:
-            f.write(new_text)
+            f.write(_to_system_newlines(new_text))
         except os.error as err:
             self.log_error("Can't write %s: %s", filename, err)
         finally:
diff --git a/Lib/lib2to3/tests/data/crlf.py b/Lib/lib2to3/tests/data/crlf.py
new file mode 100644
index 0000000..dbe2d7b
--- /dev/null
+++ b/Lib/lib2to3/tests/data/crlf.py
@@ -0,0 +1,3 @@
+print "hi"
+
+print "Like bad Windows newlines?"
diff --git a/Lib/lib2to3/tests/data/different_encoding.py b/Lib/lib2to3/tests/data/different_encoding.py
new file mode 100644
index 0000000..4bb82bd
--- /dev/null
+++ b/Lib/lib2to3/tests/data/different_encoding.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: iso-8859-1 -*-
+print(u'��������������������������������������������������������������')
+
diff --git a/Lib/lib2to3/tests/support.py b/Lib/lib2to3/tests/support.py
index 7abf2ef..8b8468c 100644
--- a/Lib/lib2to3/tests/support.py
+++ b/Lib/lib2to3/tests/support.py
@@ -9,12 +9,9 @@ import os.path
 import re
 from textwrap import dedent
 
-#sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
 # Local imports
-from .. import pytree
-from .. import refactor
-from ..pgen2 import driver
+from lib2to3 import pytree, refactor
+from lib2to3.pgen2 import driver
 
 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
@@ -25,12 +22,6 @@ driver = driver.Driver(grammar, convert=pytree.convert)
 def parse_string(string):
     return driver.parse_string(reformat(string), debug=True)
 
-# Python 2.3's TestSuite is not iter()-able
-if sys.version_info < (2, 4):
-    def TestSuite_iter(self):
-        return iter(self._tests)
-    unittest.TestSuite.__iter__ = TestSuite_iter
-
 def run_all_tests(test_mod=None, tests=None):
     if tests is None:
         tests = unittest.TestLoader().loadTestsFromModule(test_mod)
diff --git a/Lib/lib2to3/tests/test_all_fixers.py b/Lib/lib2to3/tests/test_all_fixers.py
index 68d6306..1795ade 100644
--- a/Lib/lib2to3/tests/test_all_fixers.py
+++ b/Lib/lib2to3/tests/test_all_fixers.py
@@ -28,7 +28,7 @@ class Test_all(support.TestCase):
     def test_all_project_files(self):
         for filepath in support.all_project_files():
             print("Fixing %s..." % filepath)
-            self.refactor.refactor_string(open(filepath).read(), filepath)
+            self.refactor.refactor_file(filepath)
 
 
 if __name__ == "__main__":
diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py
index 83aa812..2aa737c 100644
--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -14,9 +14,9 @@ from .support import driver, test_dir
 
 # Python imports
 import os
-import os.path
 
 # Local imports
+from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 
 
@@ -150,13 +150,25 @@ class TestParserIdempotency(support.TestCase):
     def test_all_project_files(self):
         for filepath in support.all_project_files():
             print("Parsing %s..." % filepath)
-            tree = driver.parse_file(filepath, debug=True)
-            if diff(filepath, tree):
+            with open(filepath, "rb") as fp:
+                encoding = tokenize.detect_encoding(fp.readline)[0]
+                fp.seek(0)
+                source = fp.read()
+                if encoding:
+                    source = source.decode(encoding)
+            tree = driver.parse_string(source)
+            new = str(tree)
+            if encoding:
+                new = new.encode(encoding)
+            if diff(filepath, new):
                 self.fail("Idempotency failed: %s" % filepath)
 
 
 class TestLiterals(GrammarTest):
 
+    def validate(self, s):
+        driver.parse_string(support.dedent(s) + "\n\n")
+
     def test_multiline_bytes_literals(self):
         s = """
             md5test(b"\xaa" * 80,
@@ -185,10 +197,10 @@ class TestLiterals(GrammarTest):
         self.validate(s)
 
 
-def diff(fn, tree):
+def diff(fn, result):
     f = open("@", "w")
     try:
-        f.write(str(tree))
+        f.write(result)
     finally:
         f.close()
     try:
diff --git a/Lib/lib2to3/tests/test_refactor.py b/Lib/lib2to3/tests/test_refactor.py
index 5a49f01..e55f555 100644
--- a/Lib/lib2to3/tests/test_refactor.py
+++ b/Lib/lib2to3/tests/test_refactor.py
@@ -14,7 +14,8 @@ from lib2to3 import refactor, pygram, fixer_base
 from . import support
 
 
-FIXER_DIR = os.path.join(os.path.dirname(__file__), "data/fixers")
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+FIXER_DIR = os.path.join(TEST_DATA_DIR, "fixers")
 
 sys.path.append(FIXER_DIR)
 try:
@@ -22,6 +23,8 @@ try:
 finally:
     sys.path.pop()
 
+_2TO3_FIXERS = refactor.get_fixers_from_package("lib2to3.fixes")
+
 class TestRefactoringTool(unittest.TestCase):
 
     def setUp(self):
@@ -121,19 +124,40 @@ class TestRefactoringTool(unittest.TestCase):
 +def cheese(): pass""".splitlines()
         self.assertEqual(diff_lines[:-1], expected)
 
+    def check_file_refactoring(self, test_file, fixers=_2TO3_FIXERS):
+        def read_file():
+            with open(test_file, "rb") as fp:
+                return fp.read()
+        old_contents = read_file()
+        rt = self.rt(fixers=fixers)
+
+        rt.refactor_file(test_file)
+        self.assertEqual(old_contents, read_file())
+
+        try:
+            rt.refactor_file(test_file, True)
+            self.assertNotEqual(old_contents, read_file())
+        finally:
+            with open(test_file, "wb") as fp:
+                fp.write(old_contents)
+
     def test_refactor_file(self):
         test_file = os.path.join(FIXER_DIR, "parrot_example.py")
-        old_contents = open(test_file, "r").read()
-        rt = self.rt()
+        self.check_file_refactoring(test_file, _DEFAULT_FIXERS)
 
-        rt.refactor_file(test_file)
-        self.assertEqual(old_contents, open(test_file, "r").read())
+    def test_file_encoding(self):
+        fn = os.path.join(TEST_DATA_DIR, "different_encoding.py")
+        self.check_file_refactoring(fn)
 
-        rt.refactor_file(test_file, True)
+    def test_crlf_newlines(self):
+        old_sep = os.linesep
+        os.linesep = "\r\n"
         try:
-            self.assertNotEqual(old_contents, open(test_file, "r").read())
+            fn = os.path.join(TEST_DATA_DIR, "crlf.py")
+            fixes = refactor.get_fixers_from_package("lib2to3.fixes")
+            self.check_file_refactoring(fn, fixes)
         finally:
-            open(test_file, "w").write(old_contents)
+            os.linesep = old_sep
 
     def test_refactor_docstring(self):
         rt = self.rt()
-- 
cgit v0.12