1 files changed, 125 insertions, 38 deletions
diff --git a/src/gtest.cc b/src/gtest.cc
index a0a0520..740eb74 100644
--- a/src/gtest.cc
+++ b/src/gtest.cc
@@ -784,16 +784,19 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
 // encoding, and streams the result to the given Message object.
 static void StreamWideCharsToMessage(const wchar_t* wstr, size_t len,
                                      Message* msg) {
-  for (size_t i = 0; i != len; i++) {
-    // TODO(wan): consider allowing a testing::String object to
-    // contain '\0'.  This will make it behave more like std::string,
-    // and will allow ToUtf8String() to return the correct encoding
-    // for '\0' s.t. we can get rid of the conditional here (and in
-    // several other places).
-    if (wstr[i]) {
-      *msg << internal::ToUtf8String(wstr[i]);
+  // TODO(wan): consider allowing a testing::String object to
+  // contain '\0'.  This will make it behave more like std::string,
+  // and will allow ToUtf8String() to return the correct encoding
+  // for '\0' s.t. we can get rid of the conditional here (and in
+  // several other places).
+  for (size_t i = 0; i != len; ) {  // NOLINT
+    if (wstr[i] != L'\0') {
+      *msg << WideStringToUtf8(wstr + i, len - i);
+      while (i != len && wstr[i] != L'\0')
+        i++;
     } else {
       *msg << '\0';
+      i++;
     }
   }
 }
@@ -852,8 +855,10 @@ String FormatForFailureMessage(wchar_t wchar) {
   Message msg;
   // A String object cannot contain '\0', so we print "\\0" when wchar is
   // L'\0'.
-  msg << "L'" << (wchar ? ToUtf8String(wchar).c_str() : "\\0") << "' ("
-      << wchar_as_uint64 << ", 0x" << ::std::setbase(16)
+  char buffer[32];  // CodePointToUtf8 requires a buffer that big.
+  msg << "L'"
+      << (wchar ? CodePointToUtf8(static_cast<UInt32>(wchar), buffer) : "\\0")
+      << "' (" << wchar_as_uint64 << ", 0x" << ::std::setbase(16)
       << wchar_as_uint64 << ")";
   return msg.GetString();
 }
@@ -1317,31 +1322,118 @@ inline UInt32 ChopLowBits(UInt32* bits, int n) {
   return low_bits;
 }
 
-// Converts a Unicode code-point to its UTF-8 encoding.
-String ToUtf8String(wchar_t wchar) {
-  char str[5] = {};  // Initializes str to all '\0' characters.
-
-  UInt32 code = static_cast<UInt32>(wchar);
-  if (code <= kMaxCodePoint1) {
-    str[0] = static_cast<char>(code);                          // 0xxxxxxx
-  } else if (code <= kMaxCodePoint2) {
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xC0 | code);                   // 110xxxxx
-  } else if (code <= kMaxCodePoint3) {
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xE0 | code);                   // 1110xxxx
-  } else if (code <= kMaxCodePoint4) {
-    str[3] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xF0 | code);                   // 11110xxx
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// The output buffer str must containt at least 32 characters.
+// The function returns the address of the output buffer.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'.
+char* CodePointToUtf8(UInt32 code_point, char* str) {
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else if (code_point <= kMaxCodePoint4) {
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
   } else {
-    return String::Format("(Invalid Unicode 0x%llX)",
-                          static_cast<UInt64>(wchar));
+    // The longest string String::Format can produce when invoked
+    // with these parameters is 28 character long (not including
+    // the terminating nul character). We are asking for 32 character
+    // buffer just in case. This is also enough for strncpy to
+    // null-terminate the destination string.
+    // MSVC 8 deprecates strncpy(), so we want to suppress warning
+    // 4996 (deprecated function) there.
+#ifdef GTEST_OS_WINDOWS  // We are on Windows.
+#pragma warning(push)          // Saves the current warning state.
+#pragma warning(disable:4996)  // Temporarily disables warning 4996.
+#endif
+    strncpy(str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(),
+            32);
+#ifdef GTEST_OS_WINDOWS  // We are on Windows.
+#pragma warning(pop)           // Restores the warning state.
+#endif
+    str[31] = '\0';  // Makes sure no change in the format to strncpy leaves
+                     // the result unterminated.
   }
+  return str;
+}
+
+// The following two functions only make sense if the the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
 
-  return String(str);
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  if (sizeof(wchar_t) == 2)
+    return (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+  else
+    return false;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  if (sizeof(wchar_t) == 2) {
+    const UInt32 mask = (1 << 10) - 1;
+    return (((first & mask) << 10) | (second & mask)) + 0x10000;
+  } else {
+    // This should not be called, but we provide a sensible default
+    // in case it is.
+    return static_cast<UInt32>(first);
+  }
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+String WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = wcslen(str);
+
+  StrStream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    UInt32 unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[i]);
+    }
+
+    char buffer[32];  // CodePointToUtf8 requires a buffer this big.
+    stream << CodePointToUtf8(unicode_code_point, buffer);
+  }
+  return StrStreamToString(&stream);
 }
 
 // Converts a wide C string to a String using the UTF-8 encoding.
@@ -1349,12 +1441,7 @@ String ToUtf8String(wchar_t wchar) {
 String String::ShowWideCString(const wchar_t * wide_c_str) {
   if (wide_c_str == NULL) return String("(null)");
 
-  StrStream ss;
-  while (*wide_c_str) {
-    ss << internal::ToUtf8String(*wide_c_str++);
-  }
-
-  return internal::StrStreamToString(&ss);
+  return String(internal::WideStringToUtf8(wide_c_str, -1).c_str());
 }
 
 // Similar to ShowWideCString(), except that this function encloses