diff options
-rw-r--r-- | generic/tclUtf.c | 38 | ||||
-rw-r--r-- | tests/encoding.test | 7 | ||||
-rw-r--r-- | tests/utf.test | 15 |
3 files changed, 21 insertions, 39 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 03998de..c0de80a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,7 +64,7 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -#if TCL_UTF_MAX != 4 +#if TCL_UTF_MAX < 4 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, #else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ @@ -384,7 +384,7 @@ Tcl_UtfToUniChar( * characters representing themselves. */ -#if TCL_UTF_MAX <= 4 +#if TCL_UTF_MAX == 4 /* If *chPtr contains a high surrogate (produced by a previous * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation * bytes, then we must produce a follow-up low surrogate. We only @@ -440,7 +440,7 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by at least two trail bytes. * We don't test the validity of 3th trail byte, see [ed29806ba] */ -#if TCL_UTF_MAX <= 4 +#if TCL_UTF_MAX == 4 Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; if (high < 0x400) { @@ -449,7 +449,7 @@ Tcl_UtfToUniChar( return 1; } /* out of range, < 0x10000 or > 0x10FFFF */ -#else +#elif TCL_UTF_MAX > 4 if ((src[3] & 0xC0) == 0x80) { *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); @@ -621,26 +621,12 @@ Tcl_NumUtfChars( */ while (src <= optPtr /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { -#if TCL_UTF_MAX < 4 - if (((unsigned)UCHAR(*src) - 0xF0) < 5) { - /* treat F0 - F4 as single character */ - ch = 0; - src++; - } else -#endif src += TclUtfToUniChar(src, &ch); i++; } /* Loop over the remaining string where call must happen */ while (src < endPtr) { if (Tcl_UtfCharComplete(src, endPtr - src)) { -#if TCL_UTF_MAX < 4 - if (((unsigned)UCHAR(*src) - 0xF0) < 5) { - /* treat F0 - F4 as single character */ - ch = 0; - src++; - } else -#endif src += TclUtfToUniChar(src, &ch); } else { /* @@ -1064,11 +1050,11 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) { + if (len < UtfCount(upChar)) { memmove(dst, src, len); dst += len; } else { - dst += Tcl_UniCharToUtf(upChar, dst); + dst += TclUCS4ToUtf(upChar, dst); } src += len; } @@ -1117,11 +1103,11 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < UtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { - dst += Tcl_UniCharToUtf(lowChar, dst); + dst += TclUCS4ToUtf(lowChar, dst); } src += len; } @@ -1167,11 +1153,11 @@ Tcl_UtfToTitle( len = TclUtfToUCS4(src, &ch); titleChar = UCS4ToTitle(ch); - if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) { + if (len < UtfCount(titleChar)) { memmove(dst, src, len); dst += len; } else { - dst += Tcl_UniCharToUtf(titleChar, dst); + dst += TclUCS4ToUtf(titleChar, dst); } src += len; } @@ -1183,11 +1169,11 @@ Tcl_UtfToTitle( lowChar = TclUCS4ToLower(lowChar); } - if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < UtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { - dst += Tcl_UniCharToUtf(lowChar, dst); + dst += TclUCS4ToUtf(lowChar, dst); } src += len; } diff --git a/tests/encoding.test b/tests/encoding.test index 552c97f..84f9ae1 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -335,7 +335,12 @@ test encoding-15.4 {UtfToUtfProc emoji character input} -body { set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82] list [string length $x] $y } -result "6 \uD83D\uDE02" -test encoding-15.5 {UtfToUtfProc emoji character input} { +test encoding-15.5.0 {UtfToUtfProc emoji character input} ucs2 { + set x \xF0\x9F\x98\x82 + set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] + list [string length $x] $y +} "4 \xF0\x9F\x98\x82" +test encoding-15.5.1 {UtfToUtfProc emoji character input} fullutf { set x \xF0\x9F\x98\x82 set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y diff --git a/tests/utf.test b/tests/utf.test index 7b5cbf6..1a4b157 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -219,12 +219,9 @@ test utf-6.8 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.9.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0] } 1 -test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring utf16} { +test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0] } -1 -test utf-6.9.2 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { - testutfnext [testbytestring \xA0] -} 1 test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xA0]G } 1 @@ -516,21 +513,15 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbyte test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0\xA0\xA0] } 1 -test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring utf16} { +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\xA0] } 3 -test utf-6.92.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs4} { - testutfnext [testbytestring \xA0\xA0\xA0] -} 1 test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \x80\x80\x80] } 1 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring utf16} { +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x80] } 3 -test utf-6.93.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs4} { - testutfnext [testbytestring \x80\x80\x80] -} 1 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0\xA0\xA0\xA0] } 1 |