Issue 2255: Handle scanning of UTF-8 and UTF-16 files. (Greg Spencer)

author: Steven Knight <knight@baldmt.com> 2008-12-12 06:16:31 (GMT)
committer: Steven Knight <knight@baldmt.com> 2008-12-12 06:16:31 (GMT)
commit: 6554d8631debd788d6bad226d098daee080ca20e (patch)
tree: f468f63915d4ba1b0ea3cc2d064d5ce922cfe6e4 /src/engine/SCons
parent: 8e1e691178fb24207d0e073a8c67bd8810211396 (diff)
download: SCons-6554d8631debd788d6bad226d098daee080ca20e.zip
SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.gz
SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.bz2
13 files changed, 155 insertions, 31 deletions
diff --git a/src/engine/SCons/Environment.py b/src/engine/SCons/Environment.py
index f6123b9..2304f2f 100644
--- a/src/engine/SCons/Environment.py
+++ b/src/engine/SCons/Environment.py
@@ -429,7 +429,7 @@ class SubstitutionEnvironment:
             self._dict[key] = value
 
     def get(self, key, default=None):
-        "Emulates the get() method of dictionaries."""
+        """Emulates the get() method of dictionaries."""
         return self._dict.get(key, default)
 
     def has_key(self, key):
diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py
index 98efc7a..bdc1bfd 100644
--- a/src/engine/SCons/Node/FS.py
+++ b/src/engine/SCons/Node/FS.py
@@ -35,8 +35,9 @@ that can be used by scripts or modules looking for the canonical default.
 
 __revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__"
 
-import fnmatch
 from itertools import izip
+import cStringIO
+import fnmatch
 import os
 import os.path
 import re
@@ -45,7 +46,11 @@ import stat
 import string
 import sys
 import time
-import cStringIO
+
+try:
+    import codecs
+except ImportError:
+    pass
 
 import SCons.Action
 from SCons.Debug import logInstanceCreation
@@ -876,11 +881,8 @@ class Entry(Base):
         return self.get_suffix()
 
     def get_contents(self):
-        """Fetch the contents of the entry.
-
-        Since this should return the real contents from the file
-        system, we check to see into what sort of subclass we should
-        morph this Entry."""
+        """Fetch the contents of the entry.  Returns the exact binary
+        contents of the file."""
         try:
             self = self.disambiguate(must_exist=1)
         except SCons.Errors.UserError:
@@ -893,6 +895,24 @@ class Entry(Base):
         else:
             return self.get_contents()
 
+    def get_text_contents(self):
+        """Fetch the decoded text contents of a Unicode encoded Entry.
+
+        The Entry is first disambiguated against what is on disk and
+        the result's own get_text_contents() supplies the text; if
+        disambiguation raises UserError, '' is returned instead."""
+        try:
+            self = self.disambiguate(must_exist=1)
+        except SCons.Errors.UserError:
+            # There was nothing on disk with which to disambiguate
+            # this entry.  Leave it as an Entry, but return a null
+            # string so calls to get_text_contents() in emitters and
+            # the like (e.g. in qt.py) don't have to disambiguate by
+            # hand or catch the exception.
+            return ''
+        else:
+            return self.get_text_contents()
+
     def must_be_same(self, klass):
         """Called to make sure a Node is a Dir.  Since we're an
         Entry, we can morph into one."""
@@ -1598,13 +1618,18 @@ class Dir(Base):
         """A directory does not get scanned."""
         return None
 
+    def get_text_contents(self):
+        """Return the text contents of this directory.  get_contents()
+        already builds a plain-text listing, so return it unchanged."""
+        return self.get_contents()
+
     def get_contents(self):
         """Return content signatures and names of all our children
         separated by new-lines. Ensure that the nodes are sorted."""
         contents = []
         name_cmp = lambda a, b: cmp(a.name, b.name)
         sorted_children = self.children()[:]
-        sorted_children.sort(name_cmp)        
+        sorted_children.sort(name_cmp)
         for node in sorted_children:
             contents.append('%s %s\n' % (node.get_csig(), node.name))
         return string.join(contents, '')
@@ -2236,12 +2261,28 @@ class File(Base):
             return ''
         fname = self.rfile().abspath
         try:
-            r = open(fname, "rb").read()
+            contents = open(fname, "rb").read()
         except EnvironmentError, e:
             if not e.filename:
                 e.filename = fname
             raise
-        return r
+        return contents
+
+    try:
+        import codecs
+    except ImportError:
+        get_text_contents = get_contents
+    else:
+        # Detect the text encoding from a leading byte-order mark (BOM)
+        # and decode to unicode with the BOM removed.  Contents without
+        # a recognized BOM are returned unchanged as a byte string.
+        def get_text_contents(self):
+            contents = self.get_contents()
+            if contents.startswith(codecs.BOM_UTF8):
+                contents = contents[len(codecs.BOM_UTF8):].decode('utf-8')
+            elif contents[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+                contents = contents.decode('utf-16')
+            return contents
 
     def get_content_hash(self):
         """
diff --git a/src/engine/SCons/Node/FSTests.py b/src/engine/SCons/Node/FSTests.py
index bf6a300..424aa5e 100644
--- a/src/engine/SCons/Node/FSTests.py
+++ b/src/engine/SCons/Node/FSTests.py
@@ -1192,6 +1192,18 @@ class FSTestCase(_tempdirTestCase):
         f1 = fs.File(test.workpath("binary_file"))
         assert f1.get_contents() == "Foo\x1aBar", f1.get_contents()
 
+        try:
+            # TODO(1.5)
+            eval('test_string = u"Foo\x1aBar"')
+        except SyntaxError:
+            pass
+        else:
+            # This tests to make sure we can decode UTF-8 text files.
+            test.write("utf8_file", test_string.encode('utf-8'))
+            f1 = fs.File(test.workpath("utf8_file"))
+            assert eval('f1.get_text_contents() == u"Foo\x1aBar"'), \
+                   f1.get_text_contents()
+
         def nonexistent(method, s):
             try:
                 x = method(s, create = 0)
@@ -1257,18 +1269,44 @@ class FSTestCase(_tempdirTestCase):
         finally:
             test.unlink("file")
 
+        # test Entry.get_text_contents()
+        e = fs.Entry('does_not_exist')
+        c = e.get_text_contents()
+        assert c == "", c
+        assert e.__class__ == SCons.Node.FS.Entry
+
+        test.write("file", "file\n")
+        try:
+            e = fs.Entry('file')
+            c = e.get_text_contents()
+            assert c == "file\n", c
+            assert e.__class__ == SCons.Node.FS.File
+        finally:
+            test.unlink("file")
+
         test.subdir("dir")
         e = fs.Entry('dir')
         c = e.get_contents()
         assert c == "", c
         assert e.__class__ == SCons.Node.FS.Dir
 
+        c = e.get_text_contents()
+        try:
+            eval('assert c == u"", c')
+        except SyntaxError:
+            assert c == ""
+
         if hasattr(os, 'symlink'):
             os.symlink('nonexistent', test.workpath('dangling_symlink'))
             e = fs.Entry('dangling_symlink')
             c = e.get_contents()
             assert e.__class__ == SCons.Node.FS.Entry, e.__class__
             assert c == "", c
+            c = e.get_text_contents()
+            try:
+                eval('assert c == u"", c')
+            except SyntaxError:
+                assert c == "", c
 
         test.write("tstamp", "tstamp\n")
         try:
@@ -1712,6 +1750,7 @@ class DirTestCase(_tempdirTestCase):
         files = string.split(d.get_contents(), '\n')
 
         assert e.get_contents() == '', e.get_contents()
+        assert e.get_text_contents() == '', e.get_text_contents()
         assert e.get_csig()+" empty" == files[0], files
         assert f.get_csig()+" f" == files[1], files
         assert g.get_csig()+" g" == files[2], files
@@ -2758,6 +2797,48 @@ class RepositoryTestCase(_tempdirTestCase):
         finally:
             test.unlink(["rep3", "contents"])
 
+    def test_get_text_contents(self):
+        """Ensure get_text_contents() returns text contents from
+        Repositories"""
+        fs = self.fs
+        test = self.test
+
+        # Use a test string that has a file terminator in it to make
+        # sure we read the entire file, regardless of its contents.
+        # eval() an expression so Pythons without u"" fall back below.
+        try:
+            test_string = eval('u"Con\\x1aTents\\n"')
+        except SyntaxError:
+            import UserString
+            class FakeUnicodeString(UserString.UserString):
+                def encode(self, encoding):
+                    return str(self)
+            test_string = FakeUnicodeString("Con\x1aTents\n")
+
+        # Test with ASCII.
+        test.write(["rep3", "contents"], test_string.encode('ascii'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
+        # Test with utf-8
+        test.write(["rep3", "contents"], test_string.encode('utf-8'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
+        # Test with utf-16
+        test.write(["rep3", "contents"], test_string.encode('utf-16'))
+        try:
+            c = fs.File("contents").get_text_contents()
+            assert test_string == c, "got %s" % repr(c)
+        finally:
+            test.unlink(["rep3", "contents"])
+
     #def test_is_up_to_date(self):
 
 
diff --git a/src/engine/SCons/SConfTests.py b/src/engine/SCons/SConfTests.py
index cef3889..9974485 100644
--- a/src/engine/SCons/SConfTests.py
+++ b/src/engine/SCons/SConfTests.py
@@ -335,7 +335,7 @@ int main() {
         self.scons_env[comp] = oldcomp
         self.scons_env['%sFLAGS' % comp] = 'qwertyuiop'
         r = func()
-        assert not r, "%s worked with %sFLAGS = qwertyuiop ?" % name
+        assert not r, "%s worked with %sFLAGS = qwertyuiop ?" % (name, comp)
 
     def test_CheckCC(self):
         """Test SConf.CheckCC()
diff --git a/src/engine/SCons/Scanner/D.py b/src/engine/SCons/Scanner/D.py
index bfbcd5d..dc3478a 100644
--- a/src/engine/SCons/Scanner/D.py
+++ b/src/engine/SCons/Scanner/D.py
@@ -63,6 +63,6 @@ class D(SCons.Scanner.Classic):
 
     def find_include_names(self, node):
         includes = []
-        for i in self.cre.findall(node.get_contents()):
+        for i in self.cre.findall(node.get_text_contents()):
             includes = includes + self.cre2.findall(i)
         return includes
diff --git a/src/engine/SCons/Scanner/Fortran.py b/src/engine/SCons/Scanner/Fortran.py
index 31a1e16..d2358ba 100644
--- a/src/engine/SCons/Scanner/Fortran.py
+++ b/src/engine/SCons/Scanner/Fortran.py
@@ -84,11 +84,11 @@ class F90Scanner(SCons.Scanner.Classic):
             mods_and_includes = node.includes
         else:
             # retrieve all included filenames
-            includes = self.cre_incl.findall(node.get_contents())
+            includes = self.cre_incl.findall(node.get_text_contents())
             # retrieve all USE'd module names
-            modules = self.cre_use.findall(node.get_contents())
+            modules = self.cre_use.findall(node.get_text_contents())
             # retrieve all defined module names
-            defmodules = self.cre_def.findall(node.get_contents())
+            defmodules = self.cre_def.findall(node.get_text_contents())
 
             # Remove all USE'd module names that are defined in the same file
             d = {}
diff --git a/src/engine/SCons/Scanner/LaTeX.py b/src/engine/SCons/Scanner/LaTeX.py
index c499ea5..db7f555 100644
--- a/src/engine/SCons/Scanner/LaTeX.py
+++ b/src/engine/SCons/Scanner/LaTeX.py
@@ -285,7 +285,7 @@ class LaTeX(SCons.Scanner.Base):
         if node.includes != None:
             includes = node.includes
         else:
-            includes = self.cre.findall(node.get_contents())
+            includes = self.cre.findall(node.get_text_contents())
             # 1. Split comma-separated lines, e.g.
             #      ('bibliography', 'phys,comp')
             #    should become two entries
diff --git a/src/engine/SCons/Scanner/ScannerTests.py b/src/engine/SCons/Scanner/ScannerTests.py
index 6e9286a..f6750dc 100644
--- a/src/engine/SCons/Scanner/ScannerTests.py
+++ b/src/engine/SCons/Scanner/ScannerTests.py
@@ -481,6 +481,8 @@ class ClassicTestCase(unittest.TestCase):
                 return self._exists
             def get_contents(self):
                 return self._contents
+            def get_text_contents(self):
+                return self._contents
             def get_dir(self):
                 return self._dir
 
diff --git a/src/engine/SCons/Scanner/__init__.py b/src/engine/SCons/Scanner/__init__.py
index 924b271..2d53cad 100644
--- a/src/engine/SCons/Scanner/__init__.py
+++ b/src/engine/SCons/Scanner/__init__.py
@@ -347,7 +347,7 @@ class Classic(Current):
         return SCons.Node.FS._my_normcase(include)
 
     def find_include_names(self, node):
-        return self.cre.findall(node.get_contents())
+        return self.cre.findall(node.get_text_contents())
 
     def scan(self, node, path=()):
 
diff --git a/src/engine/SCons/Tool/FortranCommon.py b/src/engine/SCons/Tool/FortranCommon.py
index 825cbe5..bf32ffa 100644
--- a/src/engine/SCons/Tool/FortranCommon.py
+++ b/src/engine/SCons/Tool/FortranCommon.py
@@ -67,7 +67,7 @@ def _fortranEmitter(target, source, env):
     mod_regex = """(?i)^\s*MODULE\s+(?!PROCEDURE)(\w+)"""
     cre = re.compile(mod_regex,re.M)
     # Retrieve all USE'd module names
-    modules = cre.findall(node.get_contents())
+    modules = cre.findall(node.get_text_contents())
     # Remove unique items from the list
     modules = SCons.Util.unique(modules)
     # Convert module name to a .mod filename
diff --git a/src/engine/SCons/Tool/jar.py b/src/engine/SCons/Tool/jar.py
index 6594ecc..7018c37 100644
--- a/src/engine/SCons/Tool/jar.py
+++ b/src/engine/SCons/Tool/jar.py
@@ -49,7 +49,7 @@ def jarSources(target, source, env, for_signature):
             jarchdir = env.fs.Dir(jarchdir)
     result = []
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] != "Manifest-Version":
             if jarchdir_set:
                 _chdir = jarchdir
@@ -70,7 +70,7 @@ def jarSources(target, source, env, for_signature):
 def jarManifest(target, source, env, for_signature):
     """Look in sources for a manifest file, if any."""
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] == "Manifest-Version":
             return src
     return ''
@@ -80,7 +80,7 @@ def jarFlags(target, source, env, for_signature):
     flag is specified."""
     jarflags = env.subst('$JARFLAGS', target=target, source=source)
     for src in source:
-        contents = src.get_contents()
+        contents = src.get_text_contents()
         if contents[:16] == "Manifest-Version":
             if not 'm' in jarflags:
                 return jarflags + 'm'
diff --git a/src/engine/SCons/Tool/qt.py b/src/engine/SCons/Tool/qt.py
index d67cddb..e2aa441 100644
--- a/src/engine/SCons/Tool/qt.py
+++ b/src/engine/SCons/Tool/qt.py
@@ -138,8 +138,8 @@ class _Automoc:
                     print "scons: qt: '%s' is no cxx file. Discarded." % str(cpp) 
                 # c or fortran source
                 continue
-            #cpp_contents = comment.sub('', cpp.get_contents())
-            cpp_contents = cpp.get_contents()
+            #cpp_contents = comment.sub('', cpp.get_text_contents())
+            cpp_contents = cpp.get_text_contents()
             h=None
             for h_ext in header_extensions:
                 # try to find the header file in the corresponding source
@@ -149,8 +149,8 @@ class _Automoc:
                 if h:
                     if debug:
                         print "scons: qt: Scanning '%s' (header of '%s')" % (str(h), str(cpp))
-                    #h_contents = comment.sub('', h.get_contents())
-                    h_contents = h.get_contents()
+                    #h_contents = comment.sub('', h.get_text_contents())
+                    h_contents = h.get_text_contents()
                     break
             if not h and debug:
                 print "scons: qt: no header for '%s'." % (str(cpp))
@@ -221,7 +221,7 @@ def uicScannerFunc(node, env, path):
     lookout = []
     lookout.extend(env['CPPPATH'])
     lookout.append(str(node.rfile().dir))
-    includes = re.findall("<include.*?>(.*?)</include>", node.get_contents())
+    includes = re.findall("<include.*?>(.*?)</include>", node.get_text_contents())
     result = []
     for incFile in includes:
         dep = env.FindFile(incFile,lookout)
diff --git a/src/engine/SCons/Tool/tex.py b/src/engine/SCons/Tool/tex.py
index 49da3d0..15e2d3e 100644
--- a/src/engine/SCons/Tool/tex.py
+++ b/src/engine/SCons/Tool/tex.py
@@ -198,7 +198,7 @@ def InternalLaTeXAuxAction(XXXLaTeXAction, target = None, source= None, env=None
     # we have to run makeindex at least once to keep the build
     # happy even if there is no index.
     # Same for glossaries and nomenclature
-    src_content = source[0].get_contents()
+    src_content = source[0].get_text_contents()
     run_makeindex = makeindex_re.search(src_content) and not os.path.exists(targetbase + '.idx')
     run_nomenclature = makenomenclature_re.search(src_content) and not os.path.exists(targetbase + '.nlo')
     run_glossary = makeglossary_re.search(src_content) and not os.path.exists(targetbase + '.glo')
@@ -373,7 +373,7 @@ LaTeX_re = re.compile("\\\\document(style|class)")
 def is_LaTeX(flist):
     # Scan a file list to decide if it's TeX- or LaTeX-flavored.
     for f in flist:
-        content = f.get_contents()
+        content = f.get_text_contents()
         if LaTeX_re.search(content):
             return 1
     return 0
@@ -422,7 +422,7 @@ def tex_pdf_emitter(target, source, env):
 def ScanFiles(theFile, target, paths, file_tests, file_tests_search, env, graphics_extensions, targetdir):
     # for theFile (a Node) update any file_tests and search for graphics files
     # then find all included files and call ScanFiles for each of them
-    content = theFile.get_contents()
+    content = theFile.get_text_contents()
     if Verbose:
         print " scanning ",str(theFile)
 
@@ -498,7 +498,7 @@ def tex_emitter_core(target, source, env, graphics_extensions):
     env.Clean(target[0],auxfilename)
     env.Clean(target[0],logfilename)
 
-    content = source[0].get_contents()
+    content = source[0].get_text_contents()
 
     idx_exists = os.path.exists(targetbase + '.idx')
     nlo_exists = os.path.exists(targetbase + '.nlo')
author	Steven Knight <knight@baldmt.com>	2008-12-12 06:16:31 (GMT)
committer	Steven Knight <knight@baldmt.com>	2008-12-12 06:16:31 (GMT)
commit	6554d8631debd788d6bad226d098daee080ca20e (patch)
tree	f468f63915d4ba1b0ea3cc2d064d5ce922cfe6e4 /src/engine/SCons
parent	8e1e691178fb24207d0e073a8c67bd8810211396 (diff)
download	SCons-6554d8631debd788d6bad226d098daee080ca20e.zip SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.gz SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.bz2