summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-08 15:19:43 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-08 15:19:43 (GMT)
commit: dc4018e40d89363bcceca8c6ef20fc315ac734d9 (patch)
tree: 9ef0bded6ed11b50b26f73112b91a1a71a05045d
parent: 07e0e0d06a58acfb653f6f760c4a7f180c6ac456 (diff)
parent: 752f99d5762c8c78c12d48f81c36d7f2a4914e37 (diff)
download: tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.zip
tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.gz
tcl-dc4018e40d89363bcceca8c6ef20fc315ac734d9.tar.bz2
Rebase to latest core-8-6-branch.
-rw-r--r-- generic/tclUtf.c 38
-rw-r--r-- tests/encoding.test 7
-rw-r--r-- tests/utf.test 15
3 files changed, 21 insertions, 39 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 03998de..c0de80a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,7 +64,7 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-#if TCL_UTF_MAX != 4
+#if TCL_UTF_MAX < 4
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
@@ -384,7 +384,7 @@ Tcl_UtfToUniChar(
* characters representing themselves.
*/
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
/* If *chPtr contains a high surrogate (produced by a previous
* Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
* bytes, then we must produce a follow-up low surrogate. We only
@@ -440,7 +440,7 @@ Tcl_UtfToUniChar(
* Four-byte-character lead byte followed by at least two trail bytes.
* We don't test the validity of 3th trail byte, see [ed29806ba]
*/
-#if TCL_UTF_MAX <= 4
+#if TCL_UTF_MAX == 4
Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
| ((src[2] & 0x3F) >> 4)) - 0x40;
if (high < 0x400) {
@@ -449,7 +449,7 @@ Tcl_UtfToUniChar(
return 1;
}
/* out of range, < 0x10000 or > 0x10FFFF */
-#else
+#elif TCL_UTF_MAX > 4
if ((src[3] & 0xC0) == 0x80) {
*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
@@ -621,26 +621,12 @@ Tcl_NumUtfChars(
*/
while (src <= optPtr
/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
-#if TCL_UTF_MAX < 4
- if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
- /* treat F0 - F4 as single character */
- ch = 0;
- src++;
- } else
-#endif
src += TclUtfToUniChar(src, &ch);
i++;
}
/* Loop over the remaining string where call must happen */
while (src < endPtr) {
if (Tcl_UtfCharComplete(src, endPtr - src)) {
-#if TCL_UTF_MAX < 4
- if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
- /* treat F0 - F4 as single character */
- ch = 0;
- src++;
- } else
-#endif
src += TclUtfToUniChar(src, &ch);
} else {
/*
@@ -1064,11 +1050,11 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) {
+ if (len < UtfCount(upChar)) {
memmove(dst, src, len);
dst += len;
} else {
- dst += Tcl_UniCharToUtf(upChar, dst);
+ dst += TclUCS4ToUtf(upChar, dst);
}
src += len;
}
@@ -1117,11 +1103,11 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
+ if (len < UtfCount(lowChar)) {
memmove(dst, src, len);
dst += len;
} else {
- dst += Tcl_UniCharToUtf(lowChar, dst);
+ dst += TclUCS4ToUtf(lowChar, dst);
}
src += len;
}
@@ -1167,11 +1153,11 @@ Tcl_UtfToTitle(
len = TclUtfToUCS4(src, &ch);
titleChar = UCS4ToTitle(ch);
- if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) {
+ if (len < UtfCount(titleChar)) {
memmove(dst, src, len);
dst += len;
} else {
- dst += Tcl_UniCharToUtf(titleChar, dst);
+ dst += TclUCS4ToUtf(titleChar, dst);
}
src += len;
}
@@ -1183,11 +1169,11 @@ Tcl_UtfToTitle(
lowChar = TclUCS4ToLower(lowChar);
}
- if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
+ if (len < UtfCount(lowChar)) {
memmove(dst, src, len);
dst += len;
} else {
- dst += Tcl_UniCharToUtf(lowChar, dst);
+ dst += TclUCS4ToUtf(lowChar, dst);
}
src += len;
}
diff --git a/tests/encoding.test b/tests/encoding.test
index 552c97f..84f9ae1 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -335,7 +335,12 @@ test encoding-15.4 {UtfToUtfProc emoji character input} -body {
set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82]
list [string length $x] $y
} -result "6 \uD83D\uDE02"
-test encoding-15.5 {UtfToUtfProc emoji character input} {
+test encoding-15.5.0 {UtfToUtfProc emoji character input} ucs2 {
+ set x \xF0\x9F\x98\x82
+ set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
+ list [string length $x] $y
+} "4 \xF0\x9F\x98\x82"
+test encoding-15.5.1 {UtfToUtfProc emoji character input} fullutf {
set x \xF0\x9F\x98\x82
set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
list [string length $x] $y
diff --git a/tests/utf.test b/tests/utf.test
index 7b5cbf6..1a4b157 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -219,12 +219,9 @@ test utf-6.8 {Tcl_UtfNext} {testutfnext testbytestring} {
test utf-6.9.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0]
} 1
-test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring utf16} {
+test utf-6.9.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \xA0]
} -1
-test utf-6.9.2 {Tcl_UtfNext} {testutfnext testbytestring ucs4} {
- testutfnext [testbytestring \xA0]
-} 1
test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} {
testutfnext [testbytestring \xA0]G
} 1
@@ -516,21 +513,15 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbyte
test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0]
} 1
-test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring utf16} {
+test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \xA0\xA0\xA0]
} 3
-test utf-6.92.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs4} {
- testutfnext [testbytestring \xA0\xA0\xA0]
-} 1
test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \x80\x80\x80]
} 1
-test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring utf16} {
+test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} {
testutfnext [testbytestring \x80\x80\x80]
} 3
-test utf-6.93.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs4} {
- testutfnext [testbytestring \x80\x80\x80]
-} 1
test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} {
testutfnext [testbytestring \xA0\xA0\xA0\xA0]
} 1