diff options
author | Brad King <brad.king@kitware.com> | 2014-08-06 13:26:28 (GMT) |
---|---|---|
committer | CMake Topic Stage <kwrobot@kitware.com> | 2014-08-06 13:26:28 (GMT) |
commit | 78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51 (patch) | |
tree | ca9e6862c432fb97d1d832060d152a6a9c289920 | |
parent | 4ec6ff8f9f38d02e300b3fc6bf75c2743eb70f97 (diff) | |
parent | 5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf (diff) | |
download | CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.zip CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.gz CMake-78efe8d4fd95feba45c41c6ecf9cdde0bfb19e51.tar.bz2 |
Merge topic 'file-strings-encoding'
5b30ec28 file: Add ENCODING option to file(STRINGS) command (#10519)
ffa373e7 file: Refactor internal implementation of file(STRINGS)
-rw-r--r-- | Help/command/file.rst | 3 | ||||
-rw-r--r-- | Help/release/dev/file-strings-encoding.rst | 5 | ||||
-rw-r--r-- | Source/cmFileCommand.cxx | 116 | ||||
-rw-r--r-- | Tests/StringFileTest/CMakeLists.txt | 10 | ||||
-rw-r--r-- | Tests/StringFileTest/test.utf8 | 3 |
5 files changed, 118 insertions, 19 deletions
diff --git a/Help/command/file.rst b/Help/command/file.rst index 58e3a26..dbc4149 100644 --- a/Help/command/file.rst +++ b/Help/command/file.rst @@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in ``REGEX <regex>`` Consider only strings that match the given regular expression. +``ENCODING <encoding-type>`` + Consider strings of a given encoding. "UTF-8" is currently supported. + For example, the code .. code-block:: cmake diff --git a/Help/release/dev/file-strings-encoding.rst b/Help/release/dev/file-strings-encoding.rst new file mode 100644 index 0000000..9da3e47 --- /dev/null +++ b/Help/release/dev/file-strings-encoding.rst @@ -0,0 +1,5 @@ +file-strings-encoding +--------------------- + +* The :command:`file(STRINGS)` command gained a new ``ENCODING`` + option to enable extraction of ``UTF-8`` strings. diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx index e47365a..1325cec 100644 --- a/Source/cmFileCommand.cxx +++ b/Source/cmFileCommand.cxx @@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) arg_length_minimum, arg_length_maximum, arg__maximum, - arg_regex }; + arg_regex, + arg_encoding }; unsigned int minlen = 0; unsigned int maxlen = 0; int limit_input = -1; @@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) bool have_regex = false; bool newline_consume = false; bool hex_conversion_enabled = true; + bool utf8_encoding = false; int arg_mode = arg_none; for(unsigned int i=3; i < args.size(); ++i) { @@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) hex_conversion_enabled = false; arg_mode = arg_none; } + else if(args[i] == "ENCODING") + { + arg_mode = arg_encoding; + } else if(arg_mode == arg_limit_input) { if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 || @@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) have_regex = 
true; arg_mode = arg_none; } + else if(arg_mode == arg_encoding) + { + if(args[i] == "UTF-8") + { + utf8_encoding = true; + } + else + { + cmOStringStream e; + e << "STRINGS option ENCODING \"" + << args[i] << "\" not recognized."; + this->SetError(e.str()); + return false; + } + arg_mode = arg_none; + } else { cmOStringStream e; @@ -596,11 +618,75 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) int output_size = 0; std::vector<std::string> strings; std::string s; - int c; while((!limit_count || strings.size() < limit_count) && (limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) && - (c = fin.get(), fin)) + fin) { + std::string current_str; + + int c = fin.get(); + + if(c == '\r') + { + // Ignore CR character to make output always have UNIX newlines. + continue; + } + + else if((c >= 0x20 && c < 0x7F) || c == '\t' || + (c == '\n' && newline_consume)) + { + // This is an ASCII character that may be part of a string. + // Cast added to avoid compiler warning. Cast is ok because + // c is guaranteed to fit in char by the above if... + current_str += static_cast<char>(c); + } + else if(utf8_encoding) + { + // Check for UTF-8 encoded string (up to 4 octets) + static const unsigned char utf8_check_table[3][2] = + { + {0xE0, 0xC0}, + {0xF0, 0xE0}, + {0xF8, 0xF0}, + }; + + // how many octets are there? 
+ unsigned int num_utf8_bytes = 0; + for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++) + { + if((c & utf8_check_table[j][0]) == utf8_check_table[j][1]) + num_utf8_bytes = j+2; + } + + // get subsequent octets and check that they are valid + for(unsigned int j=0; j<num_utf8_bytes; j++) + { + if(j != 0) + { + c = fin.get(); + if(!fin || (c & 0xC0) != 0x80) + { + fin.putback(static_cast<char>(c)); + break; + } + } + current_str += static_cast<char>(c); + } + + // if this was an invalid utf8 sequence, discard the data, and put + // back subsequent characters + if((current_str.length() != num_utf8_bytes)) + { + for(unsigned int j=0; j<current_str.size()-1; j++) + { + c = current_str[current_str.size() - 1 - j]; + fin.putback(static_cast<char>(c)); + } + current_str = ""; + } + } + + if(c == '\n' && !newline_consume) { // The current line has been terminated. Check if the current @@ -621,26 +707,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) // Reset the string to empty. s = ""; } - else if(c == '\r') - { - // Ignore CR character to make output always have UNIX newlines. - } - else if((c >= 0x20 && c < 0x7F) || c == '\t' || - (c == '\n' && newline_consume)) + else if(current_str.empty()) { - // This is an ASCII character that may be part of a string. - // Cast added to avoid compiler warning. Cast is ok because - // c is guaranteed to fit in char by the above if... - s += static_cast<char>(c); - } - else - { - // TODO: Support ENCODING option. See issue #10519. // A non-string character has been found. Check if the current // string matches the requirements. We require that the length // be at least one no matter what the user specified. 
if(s.length() >= minlen && s.length() >= 1 && - (!have_regex || regex.find(s.c_str()))) + (!have_regex || regex.find(s.c_str()))) { output_size += static_cast<int>(s.size()) + 1; if(limit_output >= 0 && output_size >= limit_output) @@ -654,10 +727,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) // Reset the string to empty. s = ""; } + else + { + s += current_str; + } + - // Terminate a string if the maximum length is reached. if(maxlen > 0 && s.size() == maxlen) { + // Terminate a string if the maximum length is reached. if(s.length() >= minlen && (!have_regex || regex.find(s.c_str()))) { diff --git a/Tests/StringFileTest/CMakeLists.txt b/Tests/StringFileTest/CMakeLists.txt index 4fa5a86..683f969 100644 --- a/Tests/StringFileTest/CMakeLists.txt +++ b/Tests/StringFileTest/CMakeLists.txt @@ -55,6 +55,16 @@ else() "file(STRINGS) incorrectly read from srec file [${infile_strings}]") endif() +#this file has utf-8 content +file(STRINGS test.utf8 infile_strings ENCODING UTF-8) +list(LENGTH infile_strings content_len) +if(content_len MATCHES "3") + message("file(STRINGS) correctly read from utf8 file [${infile_strings}]") +else() + message(SEND_ERROR + "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]") +endif() + # String test string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great") string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake") diff --git a/Tests/StringFileTest/test.utf8 b/Tests/StringFileTest/test.utf8 new file mode 100644 index 0000000..6c29170 --- /dev/null +++ b/Tests/StringFileTest/test.utf8 @@ -0,0 +1,3 @@ +The value of π (pi) is 3.141593 +Line mixed with binary partially matches valid utf8: π is à93.1593 +à
\ No newline at end of file |