summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Source/cm_codecvt.cxx288
-rw-r--r--Source/cm_codecvt.hxx28
-rw-r--r--Source/cm_utf8.c2
3 files changed, 177 insertions, 141 deletions
diff --git a/Source/cm_codecvt.cxx b/Source/cm_codecvt.cxx
index fcd1e48..cf55741 100644
--- a/Source/cm_codecvt.cxx
+++ b/Source/cm_codecvt.cxx
@@ -1,18 +1,23 @@
/* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
file Copyright.txt or https://cmake.org/licensing for details. */
#include "cm_codecvt.hxx"
-#include <limits>
#if defined(_WIN32)
+#include <assert.h>
+#include <string.h>
#include <windows.h>
#undef max
#include "cmsys/Encoding.hxx"
#endif
+#if defined(_WIN32)
+/* Number of leading ones before a zero in the byte (see cm_utf8.c). */
+extern "C" unsigned char const cm_utf8_ones[256];
+#endif
+
codecvt::codecvt(Encoding e)
- : m_lastState(0)
#if defined(_WIN32)
- , m_codepage(0)
+ : m_codepage(0)
#endif
{
switch (e) {
@@ -45,76 +50,68 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
const char*& from_next, char* to,
char* to_end, char*& to_next) const
{
+ from_next = from;
+ to_next = to;
if (m_noconv) {
- return noconv;
+ return std::codecvt_base::noconv;
}
- std::codecvt_base::result res = error;
#if defined(_WIN32)
- from_next = from;
- to_next = to;
- bool convert = true;
- size_t count = from_end - from;
- const char* data = from;
- unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
- if (count == 0) {
- return codecvt::ok;
- } else if (count == 1) {
- if (stateId == 0) {
- // decode first byte for UTF-8
- if ((*from & 0xF8) == 0xF0 || // 1111 0xxx; 4 bytes for codepoint
- (*from & 0xF0) == 0xE0 || // 1110 xxxx; 3 bytes for codepoint
- (*from & 0xE0) == 0xC0) // 110x xxxx; 2 bytes for codepoint
- {
- stateId = findStateId();
- codecvt::State& s = m_states.at(stateId - 1);
- s.bytes[0] = *from;
- convert = false;
- if ((*from & 0xF8) == 0xF0) {
- s.totalBytes = 4;
- } else if ((*from & 0xF0) == 0xE0) {
- s.totalBytes = 3;
- } else if ((*from & 0xE0) == 0xC0) {
- s.totalBytes = 2;
- }
- s.bytesLeft = s.totalBytes - 1;
- };
- // else 1 byte for codepoint
- } else {
- codecvt::State& s = m_states.at(stateId - 1);
- s.bytes[s.totalBytes - s.bytesLeft] = *from;
- s.bytesLeft--;
- data = s.bytes;
- count = s.totalBytes - s.bytesLeft;
- if ((*from & 0xC0) == 0x80) { // 10xx xxxx
- convert = s.bytesLeft == 0;
- } else {
- // invalid multi-byte
- convert = true;
- }
- if (convert) {
- s.used = false;
- if (stateId == m_lastState) {
- m_lastState--;
- }
- stateId = 0;
- }
+ // Use a const view of the state because we should not modify it until we
+ // have fully processed and consume a byte (with sufficient space in the
+ // output buffer). We call helpers to re-cast and modify the state
+ State const& lstate = reinterpret_cast<State&>(state);
+
+ while (from_next != from_end) {
+ // Count leading ones in the bits of the next byte.
+ unsigned char const ones =
+ cm_utf8_ones[static_cast<unsigned char>(*from_next)];
+
+ if (ones != 1 && lstate.buffered != 0) {
+ // We have a buffered partial codepoint that we never completed.
+ return std::codecvt_base::error;
+ } else if (ones == 1 && lstate.buffered == 0) {
+ // This is a continuation of a codepoint that never started.
+ return std::codecvt_base::error;
+ }
+
+ // Compute the number of bytes in the current codepoint.
+ int need = 0;
+ switch (ones) {
+ case 0: // 0xxx xxxx: new codepoint of size 1
+ need = 1;
+ break;
+ case 1: // 10xx xxxx: continues a codepoint
+ assert(lstate.size != 0);
+ need = lstate.size;
+ break;
+ case 2: // 110x xxxx: new codepoint of size 2
+ need = 2;
+ break;
+ case 3: // 1110 xxxx: new codepoint of size 3
+ need = 3;
+ break;
+ case 4: // 1111 0xxx: new codepoint of size 4
+ need = 4;
+ break;
+ default: // invalid byte
+ return std::codecvt_base::error;
}
- if (convert) {
- std::wstring wide = cmsys::Encoding::ToWide(std::string(data, count));
- int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
- static_cast<int>(wide.size()), to,
- to_end - to, NULL, NULL);
- if (r > 0) {
- from_next = from_end;
- to_next = to + r;
- res = ok;
+ assert(need > 0);
+
+ if (lstate.buffered + 1 == need) {
+ // This byte completes a codepoint.
+ std::codecvt_base::result decode_result =
+ this->Decode(state, need, from_next, to_next, to_end);
+ if (decode_result != std::codecvt_base::ok) {
+ return decode_result;
}
} else {
- res = partial;
- from_next = from_end;
- to_next = to;
+ // This byte does not complete a codepoint.
+ this->BufferPartial(state, need, from_next);
}
}
+
+ return std::codecvt_base::ok;
#else
static_cast<void>(state);
static_cast<void>(from);
@@ -123,46 +120,118 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
static_cast<void>(to);
static_cast<void>(to_end);
static_cast<void>(to_next);
- res = codecvt::noconv;
+ return std::codecvt_base::noconv;
#endif
- return res;
};
std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
char* to_end,
char*& to_next) const
{
- std::codecvt_base::result res = error;
to_next = to;
+ if (m_noconv) {
+ return std::codecvt_base::noconv;
+ }
#if defined(_WIN32)
- unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
- if (stateId > 0) {
- codecvt::State& s = m_states.at(stateId - 1);
- s.used = false;
- if (stateId == m_lastState) {
- m_lastState--;
- }
- stateId = 0;
- std::wstring wide = cmsys::Encoding::ToWide(
- std::string(s.bytes, s.totalBytes - s.bytesLeft));
- int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
- static_cast<int>(wide.size()), to, to_end - to,
- NULL, NULL);
- if (r > 0) {
- to_next = to + r;
- res = ok;
- }
- } else {
- res = ok;
+ State& lstate = reinterpret_cast<State&>(state);
+ if (lstate.buffered != 0) {
+ return this->DecodePartial(state, to_next, to_end);
}
+ return std::codecvt_base::ok;
#else
static_cast<void>(state);
static_cast<void>(to_end);
- res = ok;
+ return std::codecvt_base::ok;
#endif
- return res;
};
+#if defined(_WIN32)
+std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
+ const char*& from_next,
+ char*& to_next, char* to_end) const
+{
+ State& lstate = reinterpret_cast<State&>(state);
+
+ // Collect all the bytes for this codepoint.
+ char buf[4];
+ memcpy(buf, lstate.partial, lstate.buffered);
+ buf[lstate.buffered] = *from_next;
+
+ // Convert the encoding.
+ wchar_t wbuf[2];
+ int wlen =
+ MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
+ if (wlen <= 0) {
+ return std::codecvt_base::error;
+ }
+
+ int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
+ to_end - to_next, NULL, NULL);
+ if (tlen <= 0) {
+ if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
+ return std::codecvt_base::partial;
+ }
+ return std::codecvt_base::error;
+ }
+
+ // Move past the now-consumed byte in the input buffer.
+ ++from_next;
+
+ // Move past the converted codepoint in the output buffer.
+ to_next += tlen;
+
+ // Re-initialize the state for the next codepoint to start.
+ lstate = State();
+
+ return std::codecvt_base::ok;
+}
+
+std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
+ char*& to_next,
+ char* to_end) const
+{
+ State& lstate = reinterpret_cast<State&>(state);
+
+ // Try converting the partial codepoint.
+ wchar_t wbuf[2];
+ int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
+ lstate.buffered, wbuf, 2);
+ if (wlen <= 0) {
+ return std::codecvt_base::error;
+ }
+
+ int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
+ to_end - to_next, NULL, NULL);
+ if (tlen <= 0) {
+ if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
+ return std::codecvt_base::partial;
+ }
+ return std::codecvt_base::error;
+ }
+
+ // Move past the converted codepoint in the output buffer.
+ to_next += tlen;
+
+ // Re-initialize the state for the next codepoint to start.
+ lstate = State();
+
+ return std::codecvt_base::ok;
+}
+
+void codecvt::BufferPartial(mbstate_t& state, int size,
+ const char*& from_next) const
+{
+ State& lstate = reinterpret_cast<State&>(state);
+
+ // Save the byte in our buffer for later.
+ lstate.partial[lstate.buffered++] = *from_next;
+ lstate.size = size;
+
+ // Move past the now-consumed byte in the input buffer.
+ ++from_next;
+}
+#endif
+
int codecvt::do_max_length() const throw()
{
return 4;
@@ -172,44 +241,3 @@ int codecvt::do_encoding() const throw()
{
return 0;
};
-
-unsigned int codecvt::findStateId() const
-{
- unsigned int stateId = 0;
- bool add = false;
- const unsigned int maxSize = std::numeric_limits<unsigned int>::max();
- if (m_lastState >= maxSize) {
- m_lastState = 0;
- }
- if (m_states.size() <= m_lastState) {
- add = true;
- } else {
- unsigned int i = m_lastState;
- while (i < maxSize) {
- codecvt::State& s = m_states.at(i);
- i++;
- if (!s.used) {
- m_lastState = i;
- stateId = m_lastState;
- s.used = true;
- s.totalBytes = 0;
- s.bytesLeft = 0;
- break;
- }
- if (i >= m_states.size()) {
- i = 0;
- }
- if (i == m_lastState) {
- add = true;
- break;
- }
- }
- };
- if (add) {
- codecvt::State s = { true, 0, 0, { 0, 0, 0, 0 } };
- m_states.push_back(s);
- m_lastState = (unsigned int)m_states.size();
- stateId = m_lastState;
- }
- return stateId;
-};
diff --git a/Source/cm_codecvt.hxx b/Source/cm_codecvt.hxx
index b9b52ec..30c6d54 100644
--- a/Source/cm_codecvt.hxx
+++ b/Source/cm_codecvt.hxx
@@ -6,7 +6,6 @@
#include "cmConfigure.h"
#include <locale>
-#include <vector>
#include <wchar.h>
class codecvt : public std::codecvt<char, char, mbstate_t>
@@ -35,21 +34,30 @@ protected:
int do_encoding() const throw() CM_OVERRIDE;
private:
- typedef struct
+ // The mbstate_t argument to do_out and do_unshift is responsible
+ // for storing state between calls. We cannot control the type
+ // since we want to imbue on standard streams. However, we do
+ // know that it is a trivial type. Define our own type to overlay
+ // on it safely with no alignment requirements.
+ struct State
{
- bool used;
- unsigned char totalBytes;
- unsigned char bytesLeft;
- char bytes[4];
- } State;
+ // Buffer bytes we have consumed from a partial codepoint.
+ char partial[3];
- unsigned int findStateId() const;
+ // Number of bytes we have buffered from a partial codepoint.
+ unsigned char buffered : 4;
+
+ // Size of the current codepoint in bytes.
+ unsigned char size : 4;
+ };
bool m_noconv;
- mutable std::vector<State> m_states;
- mutable unsigned int m_lastState;
#if defined(_WIN32)
unsigned int m_codepage;
+ result Decode(mbstate_t& state, int need, const char*& from_next,
+ char*& to_next, char* to_end) const;
+ result DecodePartial(mbstate_t& state, char*& to_next, char* to_end) const;
+ void BufferPartial(mbstate_t& state, int need, const char*& from_next) const;
#endif
#endif
diff --git a/Source/cm_utf8.c b/Source/cm_utf8.c
index 6c49b52..52af4a6 100644
--- a/Source/cm_utf8.c
+++ b/Source/cm_utf8.c
@@ -15,7 +15,7 @@
*/
/* Number of leading ones before a zero in the byte. */
-static unsigned char const cm_utf8_ones[256] = {
+unsigned char const cm_utf8_ones[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,