summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorClinton Stimpson <clinton@elemtech.com>2014-08-04 16:47:22 (GMT)
committerBrad King <brad.king@kitware.com>2014-08-06 13:23:47 (GMT)
commit5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf (patch)
treef65b3c668a8794db8419861839d705400a88e116
parentffa373e71114727dd70f1a051414de573debb767 (diff)
downloadCMake-5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf.zip
CMake-5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf.tar.gz
CMake-5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf.tar.bz2
file: Add ENCODING option to file(STRINGS) command (#10519)
Support extraction of UTF-8 strings.
-rw-r--r--Help/command/file.rst3
-rw-r--r--Help/release/dev/file-strings-encoding.rst5
-rw-r--r--Source/cmFileCommand.cxx70
-rw-r--r--Tests/StringFileTest/CMakeLists.txt10
-rw-r--r--Tests/StringFileTest/test.utf83
5 files changed, 90 insertions, 1 deletions
diff --git a/Help/command/file.rst b/Help/command/file.rst
index 58e3a26..dbc4149 100644
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
``REGEX <regex>``
Consider only strings that match the given regular expression.
+``ENCODING <encoding-type>``
+ Consider strings of a given encoding. "UTF-8" is currently supported.
+
For example, the code
.. code-block:: cmake
diff --git a/Help/release/dev/file-strings-encoding.rst b/Help/release/dev/file-strings-encoding.rst
new file mode 100644
index 0000000..9da3e47
--- /dev/null
+++ b/Help/release/dev/file-strings-encoding.rst
@@ -0,0 +1,5 @@
+file-strings-encoding
+---------------------
+
+* The :command:`file(STRINGS)` command gained a new ``ENCODING``
+ option to enable extraction of ``UTF-8`` strings.
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index b99b1c7..1325cec 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
arg_length_minimum,
arg_length_maximum,
arg__maximum,
- arg_regex };
+ arg_regex,
+ arg_encoding };
unsigned int minlen = 0;
unsigned int maxlen = 0;
int limit_input = -1;
@@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
bool have_regex = false;
bool newline_consume = false;
bool hex_conversion_enabled = true;
+ bool utf8_encoding = false;
int arg_mode = arg_none;
for(unsigned int i=3; i < args.size(); ++i)
{
@@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
hex_conversion_enabled = false;
arg_mode = arg_none;
}
+ else if(args[i] == "ENCODING")
+ {
+ arg_mode = arg_encoding;
+ }
else if(arg_mode == arg_limit_input)
{
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
have_regex = true;
arg_mode = arg_none;
}
+ else if(arg_mode == arg_encoding)
+ {
+ if(args[i] == "UTF-8")
+ {
+ utf8_encoding = true;
+ }
+ else
+ {
+ cmOStringStream e;
+ e << "STRINGS option ENCODING \""
+ << args[i] << "\" not recognized.";
+ this->SetError(e.str());
+ return false;
+ }
+ arg_mode = arg_none;
+ }
else
{
cmOStringStream e;
@@ -618,6 +640,52 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// c is guaranteed to fit in char by the above if...
current_str += static_cast<char>(c);
}
+ else if(utf8_encoding)
+ {
+ // Check for UTF-8 encoded string (up to 4 octets)
+ static const unsigned char utf8_check_table[3][2] =
+ {
+ {0xE0, 0xC0},
+ {0xF0, 0xE0},
+ {0xF8, 0xF0},
+ };
+
+ // how many octets are there?
+ unsigned int num_utf8_bytes = 0;
+ for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
+ {
+ if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
+ num_utf8_bytes = j+2;
+ }
+
+ // get subsequent octets and check that they are valid
+ for(unsigned int j=0; j<num_utf8_bytes; j++)
+ {
+ if(j != 0)
+ {
+ c = fin.get();
+ if(!fin || (c & 0xC0) != 0x80)
+ {
+ fin.putback(static_cast<char>(c));
+ break;
+ }
+ }
+ current_str += static_cast<char>(c);
+ }
+
+ // if this was an invalid utf8 sequence, discard the data, and put
+ // back subsequent characters
+ if((current_str.length() != num_utf8_bytes))
+ {
+ for(unsigned int j=0; j<current_str.size()-1; j++)
+ {
+ c = current_str[current_str.size() - 1 - j];
+ fin.putback(static_cast<char>(c));
+ }
+ current_str = "";
+ }
+ }
+
if(c == '\n' && !newline_consume)
{
diff --git a/Tests/StringFileTest/CMakeLists.txt b/Tests/StringFileTest/CMakeLists.txt
index 4fa5a86..683f969 100644
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@@ -55,6 +55,16 @@ else()
"file(STRINGS) incorrectly read from srec file [${infile_strings}]")
endif()
+#this file has utf-8 content
+file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
+list(LENGTH infile_strings content_len)
+if(content_len MATCHES "3")
+ message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
+else()
+ message(SEND_ERROR
+ "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
+endif()
+
# String test
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
diff --git a/Tests/StringFileTest/test.utf8 b/Tests/StringFileTest/test.utf8
new file mode 100644
index 0000000..6c29170
--- /dev/null
+++ b/Tests/StringFileTest/test.utf8
@@ -0,0 +1,3 @@
+The value of π (pi) is 3.141593
+Line mixed with binary partially matches valid utf8: Ï€ is à93.1593
+à \ No newline at end of file