/* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
   file Copyright.txt or https://cmake.org/licensing for details.  */
#include "cm_codecvt.hxx"

#if defined(_WIN32)
#  include <cassert>
#  include <climits>
#  include <cstddef>
#  include <cstring>

#  include <windows.h>
#  undef max
#  include "cmsys/Encoding.hxx"
#endif

#if defined(_WIN32)
/* Number of leading ones before a zero in the byte (see cm_utf8.c).  */
extern "C" unsigned char const cm_utf8_ones[256];

namespace {
/**
 * Convert the bytes of one UTF-8 encoded codepoint to the given code page.
 *
 * @param codepage  Windows code page identifier of the output encoding.
 * @param bytes     First byte of the UTF-8 sequence to convert.
 * @param count     Number of bytes in the sequence.
 * @param to_next   Current write position; advanced past the converted bytes
 *                  only when the conversion succeeds.
 * @param to_end    One past the last writable output byte.
 * @return ok on success, partial when the output range is too small to hold
 *         the converted bytes, error when the input is not valid UTF-8 or
 *         the conversion fails for any other reason.
 */
std::codecvt_base::result ConvertCodepoint(unsigned int codepage,
                                           const char* bytes, int count,
                                           char*& to_next, char* to_end)
{
  // Decode UTF-8 to UTF-16.  One codepoint needs at most a surrogate pair.
  wchar_t wbuf[2];
  int const wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, bytes,
                                       count, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte treats an output size of zero as a request for the
  // required buffer size: it writes nothing and returns a positive count.
  // Report the lack of space ourselves instead of mistaking that count for
  // a successful conversion.
  std::ptrdiff_t const avail = to_end - to_next;
  if (avail <= 0) {
    return std::codecvt_base::partial;
  }
  // The API takes the size as an int; clamp rather than narrow implicitly.
  int const capacity = avail > INT_MAX ? INT_MAX : static_cast<int>(avail);

  // Encode UTF-16 to the target code page directly into the output range.
  int const tlen = WideCharToMultiByte(codepage, 0, wbuf, wlen, to_next,
                                       capacity, nullptr, nullptr);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;
  return std::codecvt_base::ok;
}
}
#endif

/**
 * Select the conversion performed for the requested encoding.
 *
 * Only codecvt::ANSI on Windows enables a conversion (to the CP_ACP code
 * page).  Every other combination is a pass-through.
 */
codecvt::codecvt(Encoding e)
#if defined(_WIN32)
  : m_codepage(0)
#endif
{
  switch (e) {
    case codecvt::ANSI:
#if defined(_WIN32)
      m_noconv = false;
      m_codepage = CP_ACP;
      break;
#endif
    // We don't know which ANSI encoding to use for other platforms than
    // Windows so we don't do any conversion there
    case codecvt::UTF8:
    case codecvt::UTF8_WITH_BOM:
    // Assume internal encoding is UTF-8
    case codecvt::None:
    // No encoding
    default:
      this->m_noconv = true;
  }
}

codecvt::~codecvt() = default;

/** Report whether this facet passes every byte through unchanged. */
bool codecvt::do_always_noconv() const noexcept
{
  return this->m_noconv;
}

/**
 * Convert UTF-8 input to the configured code page, one byte at a time.
 *
 * Bytes of an incomplete codepoint are stored in @a state so a codepoint may
 * be split across calls.
 *
 * @param state      Conversion state carried between calls; it is viewed as
 *                   the class's State type.
 * @param from       Start of the input range.
 * @param from_end   End of the input range.
 * @param from_next  Set to the first input byte that was not consumed.
 * @param to         Start of the output range.
 * @param to_end     End of the output range.
 * @param to_next    Set to one past the last output byte written.
 * @return noconv when no conversion is configured, ok when all input was
 *         consumed, partial when the output range filled up, error on
 *         malformed UTF-8 or a failed conversion.
 */
std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
                                          const char* from_end,
                                          const char*& from_next, char* to,
                                          char* to_end, char*& to_next) const
{
  from_next = from;
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  // Use a const view of the state because we should not modify it until we
  // have fully processed and consumed a byte (with sufficient space in the
  // output buffer).  We call helpers to re-cast and modify the state.
  State const& lstate = reinterpret_cast<State&>(state);

  while (from_next != from_end) {
    // Count leading ones in the bits of the next byte.
    unsigned char const ones =
      cm_utf8_ones[static_cast<unsigned char>(*from_next)];

    if (ones != 1 && lstate.buffered != 0) {
      // We have a buffered partial codepoint that we never completed.
      return std::codecvt_base::error;
    } else if (ones == 1 && lstate.buffered == 0) {
      // This is a continuation of a codepoint that never started.
      return std::codecvt_base::error;
    }

    // Compute the number of bytes in the current codepoint.
    int need = 0;
    switch (ones) {
      case 0: // 0xxx xxxx: new codepoint of size 1
        need = 1;
        break;
      case 1: // 10xx xxxx: continues a codepoint
        assert(lstate.size != 0);
        need = lstate.size;
        break;
      case 2: // 110x xxxx: new codepoint of size 2
        need = 2;
        break;
      case 3: // 1110 xxxx: new codepoint of size 3
        need = 3;
        break;
      case 4: // 1111 0xxx: new codepoint of size 4
        need = 4;
        break;
      default: // invalid byte
        return std::codecvt_base::error;
    }
    assert(need > 0);

    if (lstate.buffered + 1 == need) {
      // This byte completes a codepoint.
      std::codecvt_base::result decode_result =
        this->Decode(state, need, from_next, to_next, to_end);
      if (decode_result != std::codecvt_base::ok) {
        return decode_result;
      }
    } else {
      // This byte does not complete a codepoint.
      this->BufferPartial(state, need, from_next);
    }
  }

  return std::codecvt_base::ok;
#else
  static_cast<void>(state);
  static_cast<void>(from);
  static_cast<void>(from_end);
  static_cast<void>(from_next);
  static_cast<void>(to);
  static_cast<void>(to_end);
  static_cast<void>(to_next);
  return std::codecvt_base::noconv;
#endif
}

/**
 * Flush any partial codepoint still held in @a state.
 *
 * @param state    Conversion state carried between calls.
 * @param to       Start of the output range.
 * @param to_end   End of the output range.
 * @param to_next  Set to one past the last output byte written.
 * @return noconv when no conversion is configured, ok when nothing was
 *         buffered, otherwise the result of converting the buffered bytes.
 */
std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
                                              char* to_end,
                                              char*& to_next) const
{
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  State& lstate = reinterpret_cast<State&>(state);
  if (lstate.buffered != 0) {
    return this->DecodePartial(state, to_next, to_end);
  }
  return std::codecvt_base::ok;
#else
  static_cast<void>(state);
  static_cast<void>(to_end);
  return std::codecvt_base::ok;
#endif
}

#if defined(_WIN32)
/**
 * Convert a complete codepoint made of the buffered bytes plus *from_next.
 *
 * Neither the input position nor the state is modified unless the conversion
 * succeeds, so the caller can retry the same byte with more output space.
 *
 * @param state      Conversion state holding the previously buffered bytes.
 * @param size       Total number of bytes in the codepoint.
 * @param from_next  Final byte of the codepoint; advanced on success.
 * @param to_next    Current write position; advanced on success.
 * @param to_end     End of the output range.
 */
std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
                                          const char*& from_next,
                                          char*& to_next, char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Collect all the bytes for this codepoint.
  char buf[4];
  std::memcpy(buf, lstate.partial, lstate.buffered);
  buf[lstate.buffered] = *from_next;

  // Convert the encoding.
  std::codecvt_base::result const converted =
    ConvertCodepoint(this->m_codepage, buf, size, to_next, to_end);
  if (converted != std::codecvt_base::ok) {
    return converted;
  }

  // Move past the now-consumed byte in the input buffer.
  ++from_next;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}

/**
 * Try to convert the bytes buffered in @a state on their own.
 *
 * The state is reset only when the conversion succeeds.
 *
 * @param state    Conversion state holding the buffered bytes.
 * @param to_next  Current write position; advanced on success.
 * @param to_end   End of the output range.
 */
std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
                                                 char*& to_next,
                                                 char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Try converting the partial codepoint.
  std::codecvt_base::result const converted = ConvertCodepoint(
    this->m_codepage, lstate.partial, lstate.buffered, to_next, to_end);
  if (converted != std::codecvt_base::ok) {
    return converted;
  }

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}

/**
 * Store *from_next in @a state as part of a not-yet-complete codepoint.
 *
 * @param state      Conversion state receiving the byte.
 * @param size       Total number of bytes in the codepoint being collected.
 * @param from_next  Byte to store; advanced past it.
 */
void codecvt::BufferPartial(mbstate_t& state, int size,
                            const char*& from_next) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Save the byte in our buffer for later.
  lstate.partial[lstate.buffered++] = *from_next;
  lstate.size = size;

  // Move past the now-consumed byte in the input buffer.
  ++from_next;
}
#endif

/** Upper bound reported for the bytes one do_out conversion step needs. */
int codecvt::do_max_length() const noexcept
{
  return 4;
}

/** Returns 0, which std::codecvt defines as a variable-width encoding. */
int codecvt::do_encoding() const noexcept
{
  return 0;
}