diff options
author | Justin Borodinsky <justin.borodinsky@gmail.com> | 2015-01-11 19:33:36 (GMT) |
---|---|---|
committer | Brad King <brad.king@kitware.com> | 2015-01-27 16:30:26 (GMT) |
commit | 1f77a7001b2e3f8f9224cb603e5acfee45573064 (patch) | |
tree | 6e641ed7b6967267135c544adea91815523dd753 | |
parent | 19e57a48cd1ad562b277c8fb9dc8285ef96acfa0 (diff) | |
download | CMake-1f77a7001b2e3f8f9224cb603e5acfee45573064.zip CMake-1f77a7001b2e3f8f9224cb603e5acfee45573064.tar.gz CMake-1f77a7001b2e3f8f9224cb603e5acfee45573064.tar.bz2 |
file: Teach STRINGS to support UTF-16 and UTF-32 encodings
-rw-r--r-- | Help/command/file.rst | 5 | ||||
-rw-r--r-- | Help/release/dev/file-strings-utf-16.rst | 5 | ||||
-rw-r--r-- | Source/cmFileCommand.cxx | 64 | ||||
-rw-r--r-- | Tests/RunCMake/string/RunCMakeTest.cmake | 5 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-16BE-stderr.txt | 2 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-16BE.cmake | 4 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-16BE.txt | bin | 0 -> 83 bytes | |||
-rw-r--r-- | Tests/RunCMake/string/UTF-16LE-stderr.txt | 2 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-16LE.cmake | 4 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-16LE.txt | bin | 0 -> 83 bytes | |||
-rw-r--r-- | Tests/RunCMake/string/UTF-32BE-stderr.txt | 2 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-32BE.cmake | 4 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-32BE.txt | bin | 0 -> 165 bytes | |||
-rw-r--r-- | Tests/RunCMake/string/UTF-32LE-stderr.txt | 2 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-32LE.cmake | 4 | ||||
-rw-r--r-- | Tests/RunCMake/string/UTF-32LE.txt | bin | 0 -> 165 bytes |
16 files changed, 99 insertions, 4 deletions
diff --git a/Help/command/file.rst b/Help/command/file.rst index b0d4792..73d4cfa 100644 --- a/Help/command/file.rst +++ b/Help/command/file.rst @@ -65,7 +65,10 @@ Parse a list of ASCII strings from ``<filename>`` and store it in Consider only strings that match the given regular expression. ``ENCODING <encoding-type>`` - Consider strings of a given encoding. "UTF-8" is currently supported. + Consider strings of a given encoding. Currently supported encodings are: + UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE. If the ENCODING option + is not provided and the file has a Byte Order Mark, the ENCODING option + will be defaulted to respect the Byte Order Mark. For example, the code diff --git a/Help/release/dev/file-strings-utf-16.rst b/Help/release/dev/file-strings-utf-16.rst new file mode 100644 index 0000000..f40b63e --- /dev/null +++ b/Help/release/dev/file-strings-utf-16.rst @@ -0,0 +1,5 @@ +file-strings-utf-16 +------------------- + +* The :command:`file(STRINGS)` now supports UTF-16LE, UTF-16BE, + UTF-32LE, UTF-32BE as ``ENCODING`` options. 
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx index f125292..579e715 100644 --- a/Source/cmFileCommand.cxx +++ b/Source/cmFileCommand.cxx @@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) bool have_regex = false; bool newline_consume = false; bool hex_conversion_enabled = true; - bool utf8_encoding = false; + enum { encoding_none = cmsys::FStream::BOM_None, + encoding_utf8 = cmsys::FStream::BOM_UTF8, + encoding_utf16le = cmsys::FStream::BOM_UTF16LE, + encoding_utf16be = cmsys::FStream::BOM_UTF16BE, + encoding_utf32le = cmsys::FStream::BOM_UTF32LE, + encoding_utf32be = cmsys::FStream::BOM_UTF32BE}; + int encoding = encoding_none; int arg_mode = arg_none; for(unsigned int i=3; i < args.size(); ++i) { @@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) { if(args[i] == "UTF-8") { - utf8_encoding = true; + encoding = encoding_utf8; + } + else if(args[i] == "UTF-16LE") + { + encoding = encoding_utf16le; + } + else if(args[i] == "UTF-16BE") + { + encoding = encoding_utf16be; + } + else if(args[i] == "UTF-32LE") + { + encoding = encoding_utf32le; + } + else if(args[i] == "UTF-32BE") + { + encoding = encoding_utf32be; } else { @@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) return false; } + //If BOM is found and encoding was not specified, use the BOM + int bom_found = cmsys::FStream::ReadBOM(fin); + if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None) + { + encoding = bom_found; + } + + unsigned int bytes_rem = 0; + if(encoding == encoding_utf16le || encoding == encoding_utf16be) + { + bytes_rem = 1; + } + if(encoding == encoding_utf32le || encoding == encoding_utf32be) + { + bytes_rem = 3; + } + // Parse strings out of the file. 
int output_size = 0; std::vector<std::string> strings; @@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) std::string current_str; int c = fin.get(); + for(unsigned int i=0; i<bytes_rem; ++i) + { + int c1 = fin.get(); + if(!fin) + { + fin.putback(static_cast<char>(c1)); + break; + } + c = (c << 8) | c1; + } + if(encoding == encoding_utf16le) + { + c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8); + } + else if(encoding == encoding_utf32le) + { + c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) | + ((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24)); + } if(c == '\r') { @@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args) // c is guaranteed to fit in char by the above if... current_str += static_cast<char>(c); } - else if(utf8_encoding) + else if(encoding == encoding_utf8) { // Check for UTF-8 encoded string (up to 4 octets) static const unsigned char utf8_check_table[3][2] = diff --git a/Tests/RunCMake/string/RunCMakeTest.cmake b/Tests/RunCMake/string/RunCMakeTest.cmake index fc913c6..89f7ea5 100644 --- a/Tests/RunCMake/string/RunCMakeTest.cmake +++ b/Tests/RunCMake/string/RunCMakeTest.cmake @@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue) run_cmake(UuidBadType) run_cmake(RegexClear) + +run_cmake(UTF-16BE) +run_cmake(UTF-16LE) +run_cmake(UTF-32BE) +run_cmake(UTF-32LE) diff --git a/Tests/RunCMake/string/UTF-16BE-stderr.txt b/Tests/RunCMake/string/UTF-16BE-stderr.txt new file mode 100644 index 0000000..8254f87 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16BE.cmake b/Tests/RunCMake/string/UTF-16BE.cmake new file mode 100644 index 0000000..da986c0 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git 
a/Tests/RunCMake/string/UTF-16BE.txt b/Tests/RunCMake/string/UTF-16BE.txt Binary files differ new file mode 100644 index 0000000..9d976bc --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE.txt diff --git a/Tests/RunCMake/string/UTF-16LE-stderr.txt b/Tests/RunCMake/string/UTF-16LE-stderr.txt new file mode 100644 index 0000000..8254f87 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16LE.cmake b/Tests/RunCMake/string/UTF-16LE.cmake new file mode 100644 index 0000000..326d848 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-16LE.txt b/Tests/RunCMake/string/UTF-16LE.txt Binary files differ new file mode 100644 index 0000000..ebba874 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE.txt diff --git a/Tests/RunCMake/string/UTF-32BE-stderr.txt b/Tests/RunCMake/string/UTF-32BE-stderr.txt new file mode 100644 index 0000000..8254f87 --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-32BE.cmake b/Tests/RunCMake/string/UTF-32BE.cmake new file mode 100644 index 0000000..debdeaa --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32BE.txt b/Tests/RunCMake/string/UTF-32BE.txt Binary files differ new file mode 100644 index 0000000..6725fbb --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE.txt diff --git a/Tests/RunCMake/string/UTF-32LE-stderr.txt b/Tests/RunCMake/string/UTF-32LE-stderr.txt new file mode 100644 index 0000000..8254f87 --- /dev/null +++ 
b/Tests/RunCMake/string/UTF-32LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-32LE.cmake b/Tests/RunCMake/string/UTF-32LE.cmake new file mode 100644 index 0000000..22aab5f --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32LE.txt b/Tests/RunCMake/string/UTF-32LE.txt Binary files differ new file mode 100644 index 0000000..cf5102f --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE.txt |