From 690acadc17263621f5361d48057c6f938e698a58 Mon Sep 17 00:00:00 2001 From: Brad King Date: Thu, 25 May 2017 11:26:36 -0400 Subject: codecvt: Re-implement do_out and do_unshift The previous implementation assumed that only one byte would be given in the `from` buffer by the caller at a time. This may be true for MSVC but is not for the GNU library on Windows. Re-implement these methods to handle more than one byte per call. Also simplify the state management by keeping all state between calls directly in the `mbstate_t` argument instead of using it to index our own heap-allocated state. Fixes: #16893 --- Source/cm_codecvt.cxx | 288 +++++++++++++++++++++++++++----------------------- Source/cm_codecvt.hxx | 28 +++-- Source/cm_utf8.c | 2 +- 3 files changed, 177 insertions(+), 141 deletions(-) diff --git a/Source/cm_codecvt.cxx b/Source/cm_codecvt.cxx index fcd1e48..cf55741 100644 --- a/Source/cm_codecvt.cxx +++ b/Source/cm_codecvt.cxx @@ -1,18 +1,23 @@ /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying file Copyright.txt or https://cmake.org/licensing for details. */ #include "cm_codecvt.hxx" -#include #if defined(_WIN32) +#include +#include #include #undef max #include "cmsys/Encoding.hxx" #endif +#if defined(_WIN32) +/* Number of leading ones before a zero in the byte (see cm_utf8.c). */ +extern "C" unsigned char const cm_utf8_ones[256]; +#endif + codecvt::codecvt(Encoding e) - : m_lastState(0) #if defined(_WIN32) - , m_codepage(0) + : m_codepage(0) #endif { switch (e) { @@ -45,76 +50,68 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from, const char*& from_next, char* to, char* to_end, char*& to_next) const { + from_next = from; + to_next = to; if (m_noconv) { - return noconv; + return std::codecvt_base::noconv; } - std::codecvt_base::result res = error; #if defined(_WIN32) - from_next = from; - to_next = to; - bool convert = true; - size_t count = from_end - from; - const char* data = from; - unsigned int& stateId = reinterpret_cast(state); - if (count == 0) { - return codecvt::ok; - } else if (count == 1) { - if (stateId == 0) { - // decode first byte for UTF-8 - if ((*from & 0xF8) == 0xF0 || // 1111 0xxx; 4 bytes for codepoint - (*from & 0xF0) == 0xE0 || // 1110 xxxx; 3 bytes for codepoint - (*from & 0xE0) == 0xC0) // 110x xxxx; 2 bytes for codepoint - { - stateId = findStateId(); - codecvt::State& s = m_states.at(stateId - 1); - s.bytes[0] = *from; - convert = false; - if ((*from & 0xF8) == 0xF0) { - s.totalBytes = 4; - } else if ((*from & 0xF0) == 0xE0) { - s.totalBytes = 3; - } else if ((*from & 0xE0) == 0xC0) { - s.totalBytes = 2; - } - s.bytesLeft = s.totalBytes - 1; - }; - // else 1 byte for codepoint - } else { - codecvt::State& s = m_states.at(stateId - 1); - s.bytes[s.totalBytes - s.bytesLeft] = *from; - s.bytesLeft--; - data = s.bytes; - count = s.totalBytes - s.bytesLeft; - if ((*from & 0xC0) == 0x80) { // 10xx xxxx - convert = s.bytesLeft == 0; - } else { - // invalid multi-byte - convert = true; - } - if (convert) { - s.used = false; - if (stateId == m_lastState) { - m_lastState--; - } - stateId = 0; - } + // Use a const view of the state because we should not modify it until we + // have fully processed and consume a byte (with sufficient space in the + // output buffer). We call helpers to re-cast and modify the state + State const& lstate = reinterpret_cast(state); + + while (from_next != from_end) { + // Count leading ones in the bits of the next byte. + unsigned char const ones = + cm_utf8_ones[static_cast(*from_next)]; + + if (ones != 1 && lstate.buffered != 0) { + // We have a buffered partial codepoint that we never completed. + return std::codecvt_base::error; + } else if (ones == 1 && lstate.buffered == 0) { + // This is a continuation of a codepoint that never started. + return std::codecvt_base::error; + } + + // Compute the number of bytes in the current codepoint. + int need = 0; + switch (ones) { + case 0: // 0xxx xxxx: new codepoint of size 1 + need = 1; + break; + case 1: // 10xx xxxx: continues a codepoint + assert(lstate.size != 0); + need = lstate.size; + break; + case 2: // 110x xxxx: new codepoint of size 2 + need = 2; + break; + case 3: // 1110 xxxx: new codepoint of size 3 + need = 3; + break; + case 4: // 1111 0xxx: new codepoint of size 4 + need = 4; + break; + default: // invalid byte + return std::codecvt_base::error; } - if (convert) { - std::wstring wide = cmsys::Encoding::ToWide(std::string(data, count)); - int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(), - static_cast(wide.size()), to, - to_end - to, NULL, NULL); - if (r > 0) { - from_next = from_end; - to_next = to + r; - res = ok; + assert(need > 0); + + if (lstate.buffered + 1 == need) { + // This byte completes a codepoint. + std::codecvt_base::result decode_result = + this->Decode(state, need, from_next, to_next, to_end); + if (decode_result != std::codecvt_base::ok) { + return decode_result; } } else { - res = partial; - from_next = from_end; - to_next = to; + // This byte does not complete a codepoint. + this->BufferPartial(state, need, from_next); } } + + return std::codecvt_base::ok; #else static_cast(state); static_cast(from); @@ -123,46 +120,118 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from, static_cast(to); static_cast(to_end); static_cast(to_next); - res = codecvt::noconv; + return std::codecvt_base::noconv; #endif - return res; }; std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to, char* to_end, char*& to_next) const { - std::codecvt_base::result res = error; to_next = to; + if (m_noconv) { + return std::codecvt_base::noconv; + } #if defined(_WIN32) - unsigned int& stateId = reinterpret_cast(state); - if (stateId > 0) { - codecvt::State& s = m_states.at(stateId - 1); - s.used = false; - if (stateId == m_lastState) { - m_lastState--; - } - stateId = 0; - std::wstring wide = cmsys::Encoding::ToWide( - std::string(s.bytes, s.totalBytes - s.bytesLeft)); - int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(), - static_cast(wide.size()), to, to_end - to, - NULL, NULL); - if (r > 0) { - to_next = to + r; - res = ok; - } - } else { - res = ok; + State& lstate = reinterpret_cast(state); + if (lstate.buffered != 0) { + return this->DecodePartial(state, to_next, to_end); } + return std::codecvt_base::ok; #else static_cast(state); static_cast(to_end); - res = ok; + return std::codecvt_base::ok; #endif - return res; }; +#if defined(_WIN32) +std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size, + const char*& from_next, + char*& to_next, char* to_end) const +{ + State& lstate = reinterpret_cast(state); + + // Collect all the bytes for this codepoint. + char buf[4]; + memcpy(buf, lstate.partial, lstate.buffered); + buf[lstate.buffered] = *from_next; + + // Convert the encoding. + wchar_t wbuf[2]; + int wlen = + MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2); + if (wlen <= 0) { + return std::codecvt_base::error; + } + + int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, + to_end - to_next, NULL, NULL); + if (tlen <= 0) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + return std::codecvt_base::partial; + } + return std::codecvt_base::error; + } + + // Move past the now-consumed byte in the input buffer. + ++from_next; + + // Move past the converted codepoint in the output buffer. + to_next += tlen; + + // Re-initialize the state for the next codepoint to start. + lstate = State(); + + return std::codecvt_base::ok; +} + +std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state, + char*& to_next, + char* to_end) const +{ + State& lstate = reinterpret_cast(state); + + // Try converting the partial codepoint. + wchar_t wbuf[2]; + int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial, + lstate.buffered, wbuf, 2); + if (wlen <= 0) { + return std::codecvt_base::error; + } + + int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, + to_end - to_next, NULL, NULL); + if (tlen <= 0) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + return std::codecvt_base::partial; + } + return std::codecvt_base::error; + } + + // Move past the converted codepoint in the output buffer. + to_next += tlen; + + // Re-initialize the state for the next codepoint to start. + lstate = State(); + + return std::codecvt_base::ok; +} + +void codecvt::BufferPartial(mbstate_t& state, int size, + const char*& from_next) const +{ + State& lstate = reinterpret_cast(state); + + // Save the byte in our buffer for later. + lstate.partial[lstate.buffered++] = *from_next; + lstate.size = size; + + // Move past the now-consumed byte in the input buffer. + ++from_next; +} +#endif + int codecvt::do_max_length() const throw() { return 4; @@ -172,44 +241,3 @@ int codecvt::do_encoding() const throw() { return 0; }; - -unsigned int codecvt::findStateId() const -{ - unsigned int stateId = 0; - bool add = false; - const unsigned int maxSize = std::numeric_limits::max(); - if (m_lastState >= maxSize) { - m_lastState = 0; - } - if (m_states.size() <= m_lastState) { - add = true; - } else { - unsigned int i = m_lastState; - while (i < maxSize) { - codecvt::State& s = m_states.at(i); - i++; - if (!s.used) { - m_lastState = i; - stateId = m_lastState; - s.used = true; - s.totalBytes = 0; - s.bytesLeft = 0; - break; - } - if (i >= m_states.size()) { - i = 0; - } - if (i == m_lastState) { - add = true; - break; - } - } - }; - if (add) { - codecvt::State s = { true, 0, 0, { 0, 0, 0, 0 } }; - m_states.push_back(s); - m_lastState = (unsigned int)m_states.size(); - stateId = m_lastState; - } - return stateId; -}; diff --git a/Source/cm_codecvt.hxx b/Source/cm_codecvt.hxx index b9b52ec..30c6d54 100644 --- a/Source/cm_codecvt.hxx +++ b/Source/cm_codecvt.hxx @@ -6,7 +6,6 @@ #include "cmConfigure.h" #include -#include #include class codecvt : public std::codecvt @@ -35,21 +34,30 @@ protected: int do_encoding() const throw() CM_OVERRIDE; private: - typedef struct + // The mbstate_t argument to do_out and do_unshift is responsible + // for storing state between calls. We cannot control the type + // since we want to imbue on standard streams. However, we do + // know that it is a trivial type. Define our own type to overlay + // on it safely with no alignment requirements. + struct State { - bool used; - unsigned char totalBytes; - unsigned char bytesLeft; - char bytes[4]; - } State; + // Buffer bytes we have consumed from a partial codepoint. + char partial[3]; - unsigned int findStateId() const; + // Number of bytes we have buffered from a partial codepoint. + unsigned char buffered : 4; + + // Size of the current codepoint in bytes. + unsigned char size : 4; + }; bool m_noconv; - mutable std::vector m_states; - mutable unsigned int m_lastState; #if defined(_WIN32) unsigned int m_codepage; + result Decode(mbstate_t& state, int need, const char*& from_next, + char*& to_next, char* to_end) const; + result DecodePartial(mbstate_t& state, char*& to_next, char* to_end) const; + void BufferPartial(mbstate_t& state, int need, const char*& from_next) const; #endif #endif diff --git a/Source/cm_utf8.c b/Source/cm_utf8.c index 6c49b52..52af4a6 100644 --- a/Source/cm_utf8.c +++ b/Source/cm_utf8.c @@ -15,7 +15,7 @@ */ /* Number of leading ones before a zero in the byte. */ -static unsigned char const cm_utf8_ones[256] = { +unsigned char const cm_utf8_ones[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -- cgit v0.12