summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2012-07-03 19:02:36 (GMT)
committerdgp <dgp@users.sourceforge.net>2012-07-03 19:02:36 (GMT)
commit2c0610e192513092ac8e4f99101ce66920f3feac (patch)
treeb761924b8460f93fd5d8e5e046d4e317788fc744
parent80ba6f385364c497116741643bfc008ec9bfe544 (diff)
downloadtcl-dgp_hoehrmann_decoder.zip
tcl-dgp_hoehrmann_decoder.tar.gz
tcl-dgp_hoehrmann_decoder.tar.bz2
Feature branch to explore making use of the Hoehrmann UTF-8 decoder.dgp_hoehrmann_decoder
http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
-rw-r--r--generic/tclUtf.c127
1 files changed, 52 insertions, 75 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index f0d08e7..f2771dc 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -287,89 +287,66 @@ Tcl_UniCharToUtfDString(
*---------------------------------------------------------------------------
*/
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const unsigned char utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 2,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+static unsigned int
+decode(
+ unsigned int *state,
+ unsigned int *codep,
+ unsigned int byte)
+{
+ unsigned int type = utf8d[byte];
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (byte);
+
+ *state = utf8d[256 + *state*16 + type];
+ return *state;
+}
+
int
Tcl_UtfToUniChar(
register const char *src, /* The UTF-8 string. */
register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
- register int byte;
-
- /*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
- */
-
- byte = *((unsigned char *) src);
- if (byte < 0xC0) {
- /*
- * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
- * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
- * characters representing themselves.
- */
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
- } else if (byte < 0xE0) {
- if ((src[1] & 0xC0) == 0x80) {
- /*
- * Two-byte-character lead-byte followed by a trail-byte.
- */
-
- *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
- }
-
- /*
- * A two-byte-character lead-byte not followed by trail-byte
- * represents itself.
- */
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
- } else if (byte < 0xF0) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
- /*
- * Three-byte-character lead byte followed by two trail bytes.
- */
-
- *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
- | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
- }
-
- /*
- * A three-byte-character lead-byte not followed by two trail-bytes
- * represents itself.
- */
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
- }
-#if TCL_UTF_MAX > 3
- {
- int ch, total, trail;
-
- total = totalBytes[byte];
- trail = total - 1;
- if (trail > 0) {
- ch = byte & (0x3F >> trail);
- do {
- src++;
- if ((*src & 0xC0) != 0x80) {
- *chPtr = byte;
- return 1;
- }
- ch <<= 6;
- ch |= (*src & 0x3F);
- trail--;
- } while (trail > 0);
- *chPtr = ch;
- return total;
+ const char *p = src;
+ unsigned int codepoint, state = UTF8_ACCEPT;
+ int count = 1;
+
+ while (*p && count <= TCL_UTF_MAX) {
+ switch (decode(&state, &codepoint, (unsigned char)*p)) {
+ case UTF8_ACCEPT:
+ *chPtr = (Tcl_UniChar) codepoint;
+ return count;
+ case UTF8_REJECT:
+ *chPtr = (Tcl_UniChar)(*src);
+ return 1;
+ default:
+ count++;
+ p++;
}
}
-#endif
-
- *chPtr = (Tcl_UniChar) byte;
+ *chPtr = (Tcl_UniChar)(*src);
return 1;
}