Merge topic 'file-strings-encoding'

5b30ec28 file: Add ENCODING option to file(STRINGS) command (#10519) ffa373e7 file: Refactor internal implementation of file(STRINGS)
author: Brad King <brad.king@kitware.com> 2014-08-06 13:26:28 (GMT)
committer: CMake Topic Stage <kwrobot@kitware.com> 2014-08-06 13:26:28 (GMT)
commit: 78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51 (patch)
tree: ca9e6862c432fb97d1d832060d152a6a9c289920
parent: 4ec6ff8f9f38d02e300b3fc6bf75c2743eb70f97 (diff)
parent: 5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf (diff)
download: CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.zip
CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.gz
CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.bz2
5 files changed, 118 insertions, 19 deletions
diff --git a/Help/command/file.rst b/Help/command/file.rst
index 58e3a26..dbc4149 100644
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
 ``REGEX <regex>``
  Consider only strings that match the given regular expression.
 
+``ENCODING <encoding-type>``
+ Consider strings of a given encoding.  "UTF-8" is currently supported.
+
 For example, the code
 
 .. code-block:: cmake
diff --git a/Help/release/dev/file-strings-encoding.rst b/Help/release/dev/file-strings-encoding.rst
new file mode 100644
index 0000000..9da3e47
--- /dev/null
+++ b/Help/release/dev/file-strings-encoding.rst
@@ -0,0 +1,5 @@
+file-strings-encoding
+---------------------
+
+* The :command:`file(STRINGS)` command gained a new ``ENCODING``
+  option to enable extraction of ``UTF-8`` strings.
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index e47365a..1325cec 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
          arg_length_minimum,
          arg_length_maximum,
          arg__maximum,
-         arg_regex };
+         arg_regex,
+         arg_encoding };
   unsigned int minlen = 0;
   unsigned int maxlen = 0;
   int limit_input = -1;
@@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
   bool have_regex = false;
   bool newline_consume = false;
   bool hex_conversion_enabled = true;
+  bool utf8_encoding = false;
   int arg_mode = arg_none;
   for(unsigned int i=3; i < args.size(); ++i)
     {
@@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       hex_conversion_enabled = false;
       arg_mode = arg_none;
       }
+    else if(args[i] == "ENCODING")
+      {
+      arg_mode = arg_encoding;
+      }
     else if(arg_mode == arg_limit_input)
       {
       if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       have_regex = true;
       arg_mode = arg_none;
       }
+    else if(arg_mode == arg_encoding)
+      {
+      if(args[i] == "UTF-8")
+        {
+        utf8_encoding = true;
+        }
+      else
+        {
+        cmOStringStream e;
+        e << "STRINGS option ENCODING \""
+          << args[i] << "\" not recognized.";
+        this->SetError(e.str());
+        return false;
+        }
+      arg_mode = arg_none;
+      }
     else
       {
       cmOStringStream e;
@@ -596,11 +618,75 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
   int output_size = 0;
   std::vector<std::string> strings;
   std::string s;
-  int c;
   while((!limit_count || strings.size() < limit_count) &&
         (limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) &&
-        (c = fin.get(), fin))
+        fin)
     {
+    std::string current_str;
+
+    int c = fin.get();
+
+    if(c == '\r')
+      {
+      // Ignore CR character to make output always have UNIX newlines.
+      continue;
+      }
+
+    else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
+            (c == '\n' && newline_consume))
+      {
+      // This is an ASCII character that may be part of a string.
+      // Cast added to avoid compiler warning. Cast is ok because
+      // c is guaranteed to fit in char by the above if...
+      current_str += static_cast<char>(c);
+      }
+    else if(utf8_encoding)
+      {
+      // Check for UTF-8 encoded string (up to 4 octets)
+      static const unsigned char utf8_check_table[3][2] =
+        {
+          {0xE0, 0xC0},
+          {0xF0, 0xE0},
+          {0xF8, 0xF0},
+        };
+
+      // how many octets are there?
+      unsigned int num_utf8_bytes = 0;
+      for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
+        {
+        if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
+          num_utf8_bytes = j+2;
+        }
+
+      // get subsequent octets and check that they are valid
+      for(unsigned int j=0; j<num_utf8_bytes; j++)
+        {
+        if(j != 0)
+          {
+          c = fin.get();
+          if(!fin || (c & 0xC0) != 0x80)
+            {
+            fin.putback(static_cast<char>(c));
+            break;
+            }
+          }
+        current_str += static_cast<char>(c);
+        }
+
+      // if this was an invalid utf8 sequence, discard the data, and put
+      // back subsequent characters
+      if((current_str.length() != num_utf8_bytes))
+        {
+        for(unsigned int j=0; j<current_str.size()-1; j++)
+          {
+          c = current_str[current_str.size() - 1 - j];
+          fin.putback(static_cast<char>(c));
+          }
+        current_str = "";
+        }
+      }
+
+
     if(c == '\n' && !newline_consume)
       {
       // The current line has been terminated.  Check if the current
@@ -621,26 +707,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       // Reset the string to empty.
       s = "";
       }
-    else if(c == '\r')
-      {
-      // Ignore CR character to make output always have UNIX newlines.
-      }
-    else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
-            (c == '\n' && newline_consume))
+    else if(current_str.empty())
       {
-      // This is an ASCII character that may be part of a string.
-      // Cast added to avoid compiler warning. Cast is ok because
-      // c is guaranteed to fit in char by the above if...
-      s += static_cast<char>(c);
-      }
-    else
-      {
-      // TODO: Support ENCODING option.  See issue #10519.
       // A non-string character has been found.  Check if the current
       // string matches the requirements.  We require that the length
       // be at least one no matter what the user specified.
       if(s.length() >= minlen && s.length() >= 1 &&
-         (!have_regex || regex.find(s.c_str())))
+      (!have_regex || regex.find(s.c_str())))
         {
         output_size += static_cast<int>(s.size()) + 1;
         if(limit_output >= 0 && output_size >= limit_output)
@@ -654,10 +727,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       // Reset the string to empty.
       s = "";
       }
+    else
+      {
+      s += current_str;
+      }
+
 
-    // Terminate a string if the maximum length is reached.
     if(maxlen > 0 && s.size() == maxlen)
       {
+      // Terminate a string if the maximum length is reached.
       if(s.length() >= minlen &&
          (!have_regex || regex.find(s.c_str())))
         {
diff --git a/Tests/StringFileTest/CMakeLists.txt b/Tests/StringFileTest/CMakeLists.txt
index 4fa5a86..683f969 100644
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@@ -55,6 +55,16 @@ else()
     "file(STRINGS) incorrectly read from srec file [${infile_strings}]")
 endif()
 
+#this file has utf-8 content
+file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
+list(LENGTH infile_strings content_len)
+if(content_len MATCHES "3")
+  message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
+else()
+  message(SEND_ERROR
+    "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
+endif()
+
 # String test
 string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
 string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
diff --git a/Tests/StringFileTest/test.utf8 b/Tests/StringFileTest/test.utf8
new file mode 100644
index 0000000..6c29170
--- /dev/null
+++ b/Tests/StringFileTest/test.utf8
@@ -0,0 +1,3 @@
+The value of π (pi) is 3.141593
+Line mixed with binary partially matches valid utf8: π is �93.1593
+�
+\ No newline at end of file
author	Brad King <brad.king@kitware.com>	2014-08-06 13:26:28 (GMT)
committer	CMake Topic Stage <kwrobot@kitware.com>	2014-08-06 13:26:28 (GMT)
commit	78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51 (patch)
tree	ca9e6862c432fb97d1d832060d152a6a9c289920
parent	4ec6ff8f9f38d02e300b3fc6bf75c2743eb70f97 (diff)
parent	5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf (diff)
download	CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.zip CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.gz CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.bz2