From 1f77a7001b2e3f8f9224cb603e5acfee45573064 Mon Sep 17 00:00:00 2001
From: Justin Borodinsky <justin.borodinsky@gmail.com>
Date: Sun, 11 Jan 2015 14:33:36 -0500
Subject: file: Teach STRINGS to support UTF-16 and UTF-32 encodings

---
 Help/command/file.rst                     |   5 ++-
 Help/release/dev/file-strings-utf-16.rst  |   5 +++
 Source/cmFileCommand.cxx                  |  64 ++++++++++++++++++++++++++++--
 Tests/RunCMake/string/RunCMakeTest.cmake  |   5 +++
 Tests/RunCMake/string/UTF-16BE-stderr.txt |   2 +
 Tests/RunCMake/string/UTF-16BE.cmake      |   4 ++
 Tests/RunCMake/string/UTF-16BE.txt        | Bin 0 -> 83 bytes
 Tests/RunCMake/string/UTF-16LE-stderr.txt |   2 +
 Tests/RunCMake/string/UTF-16LE.cmake      |   4 ++
 Tests/RunCMake/string/UTF-16LE.txt        | Bin 0 -> 83 bytes
 Tests/RunCMake/string/UTF-32BE-stderr.txt |   2 +
 Tests/RunCMake/string/UTF-32BE.cmake      |   4 ++
 Tests/RunCMake/string/UTF-32BE.txt        | Bin 0 -> 165 bytes
 Tests/RunCMake/string/UTF-32LE-stderr.txt |   2 +
 Tests/RunCMake/string/UTF-32LE.cmake      |   4 ++
 Tests/RunCMake/string/UTF-32LE.txt        | Bin 0 -> 165 bytes
 16 files changed, 99 insertions(+), 4 deletions(-)
 create mode 100644 Help/release/dev/file-strings-utf-16.rst
 create mode 100644 Tests/RunCMake/string/UTF-16BE-stderr.txt
 create mode 100644 Tests/RunCMake/string/UTF-16BE.cmake
 create mode 100644 Tests/RunCMake/string/UTF-16BE.txt
 create mode 100644 Tests/RunCMake/string/UTF-16LE-stderr.txt
 create mode 100644 Tests/RunCMake/string/UTF-16LE.cmake
 create mode 100644 Tests/RunCMake/string/UTF-16LE.txt
 create mode 100644 Tests/RunCMake/string/UTF-32BE-stderr.txt
 create mode 100644 Tests/RunCMake/string/UTF-32BE.cmake
 create mode 100644 Tests/RunCMake/string/UTF-32BE.txt
 create mode 100644 Tests/RunCMake/string/UTF-32LE-stderr.txt
 create mode 100644 Tests/RunCMake/string/UTF-32LE.cmake
 create mode 100644 Tests/RunCMake/string/UTF-32LE.txt

diff --git a/Help/command/file.rst b/Help/command/file.rst
index b0d4792..73d4cfa 100644
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -65,7 +65,10 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
  Consider only strings that match the given regular expression.
 
 ``ENCODING <encoding-type>``
- Consider strings of a given encoding.  "UTF-8" is currently supported.
+ Consider strings of a given encoding.  Currently supported encodings are:
+ UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE.  If the ENCODING
+ option is not provided and the file has a Byte Order Mark, the encoding
+ defaults to the one indicated by the Byte Order Mark.
 
 For example, the code
 
diff --git a/Help/release/dev/file-strings-utf-16.rst b/Help/release/dev/file-strings-utf-16.rst
new file mode 100644
index 0000000..f40b63e
--- /dev/null
+++ b/Help/release/dev/file-strings-utf-16.rst
@@ -0,0 +1,5 @@
+file-strings-utf-16
+-------------------
+
+* The :command:`file(STRINGS)` command now supports UTF-16LE, UTF-16BE,
+  UTF-32LE, and UTF-32BE as ``ENCODING`` options.
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index f125292..579e715 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
   bool have_regex = false;
   bool newline_consume = false;
   bool hex_conversion_enabled = true;
-  bool utf8_encoding = false;
+  enum { encoding_none = cmsys::FStream::BOM_None,
+         encoding_utf8 = cmsys::FStream::BOM_UTF8,
+         encoding_utf16le = cmsys::FStream::BOM_UTF16LE,
+         encoding_utf16be = cmsys::FStream::BOM_UTF16BE,
+         encoding_utf32le = cmsys::FStream::BOM_UTF32LE,
+         encoding_utf32be = cmsys::FStream::BOM_UTF32BE};
+  int encoding = encoding_none;
   int arg_mode = arg_none;
   for(unsigned int i=3; i < args.size(); ++i)
     {
@@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       {
       if(args[i] == "UTF-8")
         {
-        utf8_encoding = true;
+        encoding = encoding_utf8;
+        }
+      else if(args[i] == "UTF-16LE")
+        {
+        encoding = encoding_utf16le;
+        }
+      else if(args[i] == "UTF-16BE")
+        {
+        encoding = encoding_utf16be;
+        }
+      else if(args[i] == "UTF-32LE")
+        {
+        encoding = encoding_utf32le;
+        }
+      else if(args[i] == "UTF-32BE")
+        {
+        encoding = encoding_utf32be;
         }
       else
         {
@@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
     return false;
     }
 
+  //If BOM is found and encoding was not specified, use the BOM
+  int bom_found = cmsys::FStream::ReadBOM(fin);
+  if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None)
+    {
+    encoding = bom_found;
+    }
+
+  unsigned int bytes_rem = 0;
+  if(encoding == encoding_utf16le || encoding == encoding_utf16be)
+    {
+    bytes_rem = 1;
+    }
+  if(encoding == encoding_utf32le || encoding == encoding_utf32be)
+    {
+    bytes_rem = 3;
+    }
+
   // Parse strings out of the file.
   int output_size = 0;
   std::vector<std::string> strings;
@@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
     std::string current_str;
 
     int c = fin.get();
+    for(unsigned int i=0; i<bytes_rem; ++i)
+      {
+      int c1 = fin.get();
+      if(!fin)
+        {
+        fin.putback(static_cast<char>(c1));
+        break;
+        }
+      c = (c << 8) | c1;
+      }
+    if(encoding == encoding_utf16le)
+      {
+      c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8);
+      }
+    else if(encoding == encoding_utf32le)
+      {
+       c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) |
+          ((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24));
+      }
 
     if(c == '\r')
       {
@@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       // c is guaranteed to fit in char by the above if...
       current_str += static_cast<char>(c);
       }
-    else if(utf8_encoding)
+    else if(encoding == encoding_utf8)
       {
       // Check for UTF-8 encoded string (up to 4 octets)
       static const unsigned char utf8_check_table[3][2] =
diff --git a/Tests/RunCMake/string/RunCMakeTest.cmake b/Tests/RunCMake/string/RunCMakeTest.cmake
index fc913c6..89f7ea5 100644
--- a/Tests/RunCMake/string/RunCMakeTest.cmake
+++ b/Tests/RunCMake/string/RunCMakeTest.cmake
@@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue)
 run_cmake(UuidBadType)
 
 run_cmake(RegexClear)
+
+run_cmake(UTF-16BE)
+run_cmake(UTF-16LE)
+run_cmake(UTF-32BE)
+run_cmake(UTF-32LE)
diff --git a/Tests/RunCMake/string/UTF-16BE-stderr.txt b/Tests/RunCMake/string/UTF-16BE-stderr.txt
new file mode 100644
index 0000000..8254f87
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-16BE-stderr.txt
@@ -0,0 +1,2 @@
+Hello World
+Hello World
diff --git a/Tests/RunCMake/string/UTF-16BE.cmake b/Tests/RunCMake/string/UTF-16BE.cmake
new file mode 100644
index 0000000..da986c0
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-16BE.cmake
@@ -0,0 +1,4 @@
+file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4)
+message("${str}")
diff --git a/Tests/RunCMake/string/UTF-16BE.txt b/Tests/RunCMake/string/UTF-16BE.txt
new file mode 100644
index 0000000..9d976bc
Binary files /dev/null and b/Tests/RunCMake/string/UTF-16BE.txt differ
diff --git a/Tests/RunCMake/string/UTF-16LE-stderr.txt b/Tests/RunCMake/string/UTF-16LE-stderr.txt
new file mode 100644
index 0000000..8254f87
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-16LE-stderr.txt
@@ -0,0 +1,2 @@
+Hello World
+Hello World
diff --git a/Tests/RunCMake/string/UTF-16LE.cmake b/Tests/RunCMake/string/UTF-16LE.cmake
new file mode 100644
index 0000000..326d848
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-16LE.cmake
@@ -0,0 +1,4 @@
+file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4)
+message("${str}")
diff --git a/Tests/RunCMake/string/UTF-16LE.txt b/Tests/RunCMake/string/UTF-16LE.txt
new file mode 100644
index 0000000..ebba874
Binary files /dev/null and b/Tests/RunCMake/string/UTF-16LE.txt differ
diff --git a/Tests/RunCMake/string/UTF-32BE-stderr.txt b/Tests/RunCMake/string/UTF-32BE-stderr.txt
new file mode 100644
index 0000000..8254f87
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-32BE-stderr.txt
@@ -0,0 +1,2 @@
+Hello World
+Hello World
diff --git a/Tests/RunCMake/string/UTF-32BE.cmake b/Tests/RunCMake/string/UTF-32BE.cmake
new file mode 100644
index 0000000..debdeaa
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-32BE.cmake
@@ -0,0 +1,4 @@
+file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4)
+message("${str}")
diff --git a/Tests/RunCMake/string/UTF-32BE.txt b/Tests/RunCMake/string/UTF-32BE.txt
new file mode 100644
index 0000000..6725fbb
Binary files /dev/null and b/Tests/RunCMake/string/UTF-32BE.txt differ
diff --git a/Tests/RunCMake/string/UTF-32LE-stderr.txt b/Tests/RunCMake/string/UTF-32LE-stderr.txt
new file mode 100644
index 0000000..8254f87
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-32LE-stderr.txt
@@ -0,0 +1,2 @@
+Hello World
+Hello World
diff --git a/Tests/RunCMake/string/UTF-32LE.cmake b/Tests/RunCMake/string/UTF-32LE.cmake
new file mode 100644
index 0000000..22aab5f
--- /dev/null
+++ b/Tests/RunCMake/string/UTF-32LE.cmake
@@ -0,0 +1,4 @@
+file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4)
+message("${str}")
diff --git a/Tests/RunCMake/string/UTF-32LE.txt b/Tests/RunCMake/string/UTF-32LE.txt
new file mode 100644
index 0000000..cf5102f
Binary files /dev/null and b/Tests/RunCMake/string/UTF-32LE.txt differ
-- 
cgit v0.12