author    Steven Knight <knight@baldmt.com>    2009-03-02 19:11:09 (GMT)
committer Steven Knight <knight@baldmt.com>    2009-03-02 19:11:09 (GMT)
commit    0d621c8ecd14c47d09a4b42e527b639135d70fc8 (patch)
tree      5ee6852eebcbda0dcb8ff137037c83c5dc377472 /src/engine/SCons
parent    75d27c7dac1621ca3f8c26bfa31b93b77d56c2dd (diff)
Fix handling of both UTF_16_LE and UTF_16_BE. Add an actual test for
scanning Unicode files for implicit dependencies. Clean up how we handle decoding, and wrap it for earlier Python versions.
Diffstat (limited to 'src/engine/SCons')
-rw-r--r--  src/engine/SCons/Node/FS.py | 65
1 file changed, 58 insertions(+), 7 deletions(-)
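
The bug being fixed: codecs.BOM_UTF16 is a single constant whose value depends on the byte order of the host Python is running on, so checking only that constant misses files written with the opposite byte order. A minimal sketch (not part of the commit; guess_bom_encoding is a hypothetical helper) of the endianness-independent detection the patch switches to:

import codecs

def guess_bom_encoding(data):
    # Probe each BOM signature explicitly instead of relying on the
    # host-byte-order-dependent codecs.BOM_UTF16 constant.
    if data.startswith(codecs.BOM_UTF8):        # '\xef\xbb\xbf'
        return 'utf-8'
    if data.startswith(codecs.BOM_UTF16_LE):    # '\xff\xfe'
        return 'utf-16-le'
    if data.startswith(codecs.BOM_UTF16_BE):    # '\xfe\xff'
        return 'utf-16-be'
    return None

# A big-endian file is detected correctly even on a little-endian host:
assert guess_bom_encoding(codecs.BOM_UTF16_BE + 'x'.encode('utf-16-be')) == 'utf-16-be'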
diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py
index a4036ab..bd8314b 100644
--- a/src/engine/SCons/Node/FS.py
+++ b/src/engine/SCons/Node/FS.py
@@ -58,12 +58,46 @@ else:
     except AttributeError:
         codecs.BOM_UTF8 = '\xef\xbb\xbf'
     try:
-        codecs.BOM_UTF16
+        codecs.BOM_UTF16_LE
+        codecs.BOM_UTF16_BE
     except AttributeError:
-        if sys.byteorder == 'little':
-            codecs.BOM_UTF16 = '\xff\xfe'
+        codecs.BOM_UTF16_LE = '\xff\xfe'
+        codecs.BOM_UTF16_BE = '\xfe\xff'
+
+    # Provide a wrapper function to handle decoding differences in
+    # different versions of Python. Normally, we'd try to do this in the
+    # compat layer (and maybe it still makes sense to move there?) but
+    # that doesn't provide a way to supply the string class used in
+    # pre-2.3 Python versions with a .decode() method that all strings
+    # naturally have. Plus, the 2.[01] encodings behave differently
+    # enough that we have to settle for a lowest-common-denominator
+    # wrapper approach.
+    #
+    # Note that the 2.[012] implementations below may be inefficient
+    # because they perform an explicit look up of the encoding for every
+    # decode, but they're old enough (and we want to stop supporting
+    # them soon enough) that it's not worth complicating the interface.
+    # Think of it as additional incentive for people to upgrade...
+    try:
+        ''.decode
+    except AttributeError:
+        # 2.0 through 2.2: strings have no .decode() method
+        try:
+            codecs.lookup('ascii').decode
+        except AttributeError:
+            # 2.0 and 2.1: encodings are a tuple of functions, and the
+            # decode() function returns a (result, length) tuple.
+            def my_decode(contents, encoding):
+                return codecs.lookup(encoding)[1](contents)[0]
         else:
-            codecs.BOM_UTF16 = '\xfe\xff'
+            # 2.2: encodings are an object with methods, and the
+            # .decode() method returns just the decoded bytes.
+            def my_decode(contents, encoding):
+                return codecs.lookup(encoding).decode(contents)
+    else:
+        # 2.3 or later: use the .decode() string method
+        def my_decode(contents, encoding):
+            return contents.decode(encoding)
 
 import SCons.Action
 from SCons.Debug import logInstanceCreation
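
The wrapper above picks an implementation once, at import time, by probing for capabilities (does '' have a .decode method? does the codecs.lookup() result?) rather than by comparing version numbers. A runnable sketch of the same feature-detection idiom, reduced to the two branches that still execute anywhere (an assumption worth stating: 3.x strings also lack .decode(), so modern Pythons take the AttributeError branch, where codecs.lookup() returns a tuple-like CodecInfo whose index 1 is the stateless decoder):

import codecs

try:
    ''.decode                      # present on 2.3+ str, absent on 3.x str
except AttributeError:
    def my_decode(contents, encoding):
        # codecs.lookup() returns a tuple-like object; index 1 is the
        # decoder, which returns a (decoded, bytes_consumed) pair.
        return codecs.lookup(encoding)[1](contents)[0]
else:
    def my_decode(contents, encoding):
        return contents.decode(encoding)

assert my_decode(b'h\x00i\x00', 'utf-16-le') == 'hi'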
@@ -2309,10 +2343,27 @@ class File(Base):
     # it's a valid python string.
     def get_text_contents(self):
         contents = self.get_contents()
+        # The behavior of various decode() methods and functions
+        # w.r.t. the initial BOM bytes is different for different
+        # encodings and/or Python versions. ('utf-8' does not strip
+        # them, but has a 'utf-8-sig' which does; 'utf-16' seems to
+        # strip them; etc.) Just side step all the complication by
+        # explicitly stripping the BOM before we decode().
         if contents.startswith(codecs.BOM_UTF8):
-            contents = contents.decode('utf-8')
-        elif contents.startswith(codecs.BOM_UTF16):
-            contents = contents.decode('utf-16')
+            contents = contents[len(codecs.BOM_UTF8):]
+            # TODO(2.2): Remove when 2.3 becomes floor.
+            #contents = contents.decode('utf-8')
+            contents = my_decode(contents, 'utf-8')
+        elif contents.startswith(codecs.BOM_UTF16_LE):
+            contents = contents[len(codecs.BOM_UTF16_LE):]
+            # TODO(2.2): Remove when 2.3 becomes floor.
+            #contents = contents.decode('utf-16-le')
+            contents = my_decode(contents, 'utf-16-le')
+        elif contents.startswith(codecs.BOM_UTF16_BE):
+            contents = contents[len(codecs.BOM_UTF16_BE):]
+            # TODO(2.2): Remove when 2.3 becomes floor.
+            #contents = contents.decode('utf-16-be')
+            contents = my_decode(contents, 'utf-16-be')
         return contents
 
     def get_content_hash(self):
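
For reference, the resulting get_text_contents() logic, modeled as a self-contained function (text_contents is a hypothetical stand-in; the real method reads self.get_contents() and, on pre-2.3 interpreters, decodes through the my_decode() wrapper):

import codecs

def text_contents(contents):
    # Strip the BOM by hand, then decode with the matching explicit
    # encoding, so per-codec differences in BOM handling never matter.
    if contents.startswith(codecs.BOM_UTF8):
        return contents[len(codecs.BOM_UTF8):].decode('utf-8')
    elif contents.startswith(codecs.BOM_UTF16_LE):
        return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
    elif contents.startswith(codecs.BOM_UTF16_BE):
        return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
    return contents   # no BOM: leave the raw contents alone

# A big-endian UTF-16 SConscript now decodes correctly on any host:
data = codecs.BOM_UTF16_BE + 'env = Environment()'.encode('utf-16-be')
assert text_contents(data) == 'env = Environment()'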