Issue 2255: Handle scanning of UTF-8 and UTF-16 files. (Greg Spencer)

author: Steven Knight <knight@baldmt.com> 2008-12-12 06:16:31 (GMT)
committer: Steven Knight <knight@baldmt.com> 2008-12-12 06:16:31 (GMT)
commit: 6554d8631debd788d6bad226d098daee080ca20e (patch)
tree: f468f63915d4ba1b0ea3cc2d064d5ce922cfe6e4 /src/engine/SCons/Node/FS.py
parent: 8e1e691178fb24207d0e073a8c67bd8810211396 (diff)
download: SCons-6554d8631debd788d6bad226d098daee080ca20e.zip
SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.gz
SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.bz2
1 files changed, 51 insertions, 10 deletions
diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py
index 98efc7a..bdc1bfd 100644
--- a/src/engine/SCons/Node/FS.py
+++ b/src/engine/SCons/Node/FS.py
@@ -35,8 +35,9 @@ that can be used by scripts or modules looking for the canonical default.
 
 __revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__"
 
-import fnmatch
 from itertools import izip
+import cStringIO
+import fnmatch
 import os
 import os.path
 import re
@@ -45,7 +46,11 @@ import stat
 import string
 import sys
 import time
-import cStringIO
+
+try:
+    import codecs
+except ImportError:
+    pass
 
 import SCons.Action
 from SCons.Debug import logInstanceCreation
@@ -876,11 +881,8 @@ class Entry(Base):
         return self.get_suffix()
 
     def get_contents(self):
-        """Fetch the contents of the entry.
-
-        Since this should return the real contents from the file
-        system, we check to see into what sort of subclass we should
-        morph this Entry."""
+        """Fetch the contents of the entry.  Returns the exact binary
+        contents of the file."""
         try:
             self = self.disambiguate(must_exist=1)
         except SCons.Errors.UserError:
@@ -893,6 +895,24 @@ class Entry(Base):
         else:
             return self.get_contents()
 
+    def get_text_contents(self):
+        """Fetch the decoded text contents of a Unicode encoded Entry.
+
+        Since this should return the text contents from the file
+        system, we check to see into what sort of subclass we should
+        morph this Entry."""
+        try:
+            self = self.disambiguate(must_exist=1)
+        except SCons.Errors.UserError:
+            # There was nothing on disk with which to disambiguate
+            # this entry.  Leave it as an Entry, but return a null
+            # string so calls to get_text_contents() in emitters and
+            # the like (e.g. in qt.py) don't have to disambiguate by
+            # hand or catch the exception.
+            return ''
+        else:
+            return self.get_text_contents()
+
     def must_be_same(self, klass):
         """Called to make sure a Node is a Dir.  Since we're an
         Entry, we can morph into one."""
@@ -1598,13 +1618,18 @@ class Dir(Base):
         """A directory does not get scanned."""
         return None
 
+    def get_text_contents(self):
+        """We already emit things in text, so just return the binary
+        version."""
+        return self.get_contents()
+
     def get_contents(self):
         """Return content signatures and names of all our children
         separated by new-lines. Ensure that the nodes are sorted."""
         contents = []
         name_cmp = lambda a, b: cmp(a.name, b.name)
         sorted_children = self.children()[:]
-        sorted_children.sort(name_cmp)        
+        sorted_children.sort(name_cmp)
         for node in sorted_children:
             contents.append('%s %s\n' % (node.get_csig(), node.name))
         return string.join(contents, '')
@@ -2236,12 +2261,28 @@ class File(Base):
             return ''
         fname = self.rfile().abspath
         try:
-            r = open(fname, "rb").read()
+            contents = open(fname, "rb").read()
         except EnvironmentError, e:
             if not e.filename:
                 e.filename = fname
             raise
-        return r
+        return contents
+
+    try:
+        import codecs
+    except ImportError:
+        get_text_contents = get_contents
+    else:
+        # This attempts to figure out what the encoding of the text is
+        # based upon the BOM bytes, and then decodes the contents so that
+        # it's a valid python string.
+        def get_text_contents(self):
+            contents = self.get_contents()
+            if contents.startswith(codecs.BOM_UTF8):
+                contents = contents.decode('utf-8')
+            elif contents.startswith(codecs.BOM_UTF16):
+                contents = contents.decode('utf-16')
+            return contents
 
     def get_content_hash(self):
         """
author	Steven Knight <knight@baldmt.com>	2008-12-12 06:16:31 (GMT)
committer	Steven Knight <knight@baldmt.com>	2008-12-12 06:16:31 (GMT)
commit	6554d8631debd788d6bad226d098daee080ca20e (patch)
tree	f468f63915d4ba1b0ea3cc2d064d5ce922cfe6e4 /src/engine/SCons/Node/FS.py
parent	8e1e691178fb24207d0e073a8c67bd8810211396 (diff)
download	SCons-6554d8631debd788d6bad226d098daee080ca20e.zip SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.gz SCons-6554d8631debd788d6bad226d098daee080ca20e.tar.bz2