diff options
author | dgp <dgp@users.sourceforge.net> | 2012-07-03 19:02:36 (GMT) |
---|---|---|
committer | dgp <dgp@users.sourceforge.net> | 2012-07-03 19:02:36 (GMT) |
commit | 2c0610e192513092ac8e4f99101ce66920f3feac (patch) | |
tree | b761924b8460f93fd5d8e5e046d4e317788fc744 /generic | |
parent | 80ba6f385364c497116741643bfc008ec9bfe544 (diff) | |
download | tcl-dgp_hoehrmann_decoder.zip tcl-dgp_hoehrmann_decoder.tar.gz tcl-dgp_hoehrmann_decoder.tar.bz2 |
Feature branch to explore making use of the Hoehrmann UTF-8 decoder.dgp_hoehrmann_decoder
http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclUtf.c | 127 |
1 files changed, 52 insertions, 75 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index f0d08e7..f2771dc 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -287,89 +287,66 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const unsigned char utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 2,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static unsigned int +decode( + unsigned int *state, + unsigned int *codep, + unsigned int byte) +{ + unsigned int type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { - register int byte; - - /* - * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. - */ - - byte = *((unsigned char *) src); - if (byte < 0xC0) { - /* - * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid - * characters representing themselves. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } else if (byte < 0xE0) { - if ((src[1] & 0xC0) == 0x80) { - /* - * Two-byte-character lead-byte followed by a trail-byte. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - return 2; - } - - /* - * A two-byte-character lead-byte not followed by trail-byte - * represents itself. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } else if (byte < 0xF0) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { - /* - * Three-byte-character lead byte followed by two trail bytes. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) - | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - return 3; - } - - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } -#if TCL_UTF_MAX > 3 - { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - src++; - if ((*src & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*src & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; + const char *p = src; + unsigned int codepoint, state = UTF8_ACCEPT; + int count = 1; + + while (*p && count <= TCL_UTF_MAX) { + switch (decode(&state, &codepoint, (unsigned char)*p)) { + case UTF8_ACCEPT: + *chPtr = (Tcl_UniChar) codepoint; + return count; + case UTF8_REJECT: + *chPtr = (Tcl_UniChar)(*src); + return 1; + default: + count++; + p++; } } -#endif - - *chPtr = (Tcl_UniChar) byte; + *chPtr = (Tcl_UniChar)(*src); return 1; } |