summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-11-14 14:50:41 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-11-14 14:50:41 (GMT)
commit: 33632b09553ec01114f8181fe806b68bc312e4d3 (patch)
tree: 07a9b79782b6a9cd0e010e40e527e1ab4babd993
parent: d085d8f0bbd1184009cd58e47223e956255db6bb (diff)
parent: 0cb2dc120b35cf286c565c683a271710aad03f5f (diff)
download: tcl-33632b09553ec01114f8181fe806b68bc312e4d3.zip
tcl-33632b09553ec01114f8181fe806b68bc312e4d3.tar.gz
tcl-33632b09553ec01114f8181fe806b68bc312e4d3.tar.bz2
Merge 8.7
-rw-r--r-- generic/tclEncoding.c 37
-rw-r--r-- tests/encoding.test 22
2 files changed, 47 insertions, 12 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index c958b08..351d7ac 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2140,7 +2140,7 @@ BinaryProc(
/*
*-------------------------------------------------------------------------
*
- * UtfExtToUtfIntProc --
+ * UtfIntToUtfExtProc --
*
* Convert from UTF-8 to UTF-8. While converting null-bytes from the
* Tcl's internal representation (0xc0, 0x80) to the official
@@ -2281,7 +2281,7 @@ UtfToUtfProc(
* output buffer. */
int pureNullMode) /* Convert embedded nulls from internal
* representation to real null-bytes or vice
- * versa. */
+ * versa. Also combine or separate surrogate pairs */
{
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
@@ -2297,14 +2297,14 @@ UtfToUtfProc(
srcEnd = src + srcLen;
srcClose = srcEnd;
if ((flags & TCL_ENCODING_END) == 0) {
- srcClose -= TCL_UTF_MAX;
+ srcClose -= 6;
}
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
dstStart = dst;
- dstEnd = dst + dstLen - TCL_UTF_MAX;
+ dstEnd = dst + dstLen - ((pureNullMode == 1) ? 4 : TCL_UTF_MAX);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2346,15 +2346,28 @@ UtfToUtfProc(
src += 1;
dst += Tcl_UniCharToUtf(*chPtr, dst);
} else {
- int len = TclUtfToUniChar(src, chPtr);
- src += len;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
-#if TCL_UTF_MAX <= 4
- if ((*chPtr >= 0xD800) && (len < 3)) {
- src += Tcl_UtfToUniChar(src, chPtr);
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ src += TclUtfToUniChar(src, chPtr);
+ if ((*chPtr & 0xFC00) == 0xD800) {
+ /* A high surrogate character is detected, handle especially */
+ Tcl_UniChar low = *chPtr;
+ size_t len = Tcl_UtfToUniChar(src, &low);
+ if ((low & 0xFC00) != 0xDC00) {
+ *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
+ continue;
+ } else if (pureNullMode == 1) {
+ int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+ *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7);
+ *dst++ = (char) (((full >> 12) | 0x80) & 0xBF);
+ *dst++ = (char) (((full >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((full | 0x80) & 0xBF);
+ *chPtr = 0;
+ src += len;
+ continue;
+ }
}
-#endif
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
}
}
diff --git a/tests/encoding.test b/tests/encoding.test
index da34f03..694c07b 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -321,6 +321,28 @@ test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
binary scan [teststringbytes $y] H* z
set z
} c080
+test encoding-15.4 {UtfToUtfProc emoji character input} {
+ set x \xED\xA0\xBD\xED\xB8\x82
+ set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82]
+ list [string length $x] $y
+} "6 \U1F602"
+test encoding-15.5 {UtfToUtfProc emoji character input} {
+ set x \xF0\x9F\x98\x82
+ set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
+ list [string length $x] $y
+} "4 \U1F602"
+test encoding-15.6 {UtfToUtfProc emoji character output} {
+ set x \uD83D\uDE02
+ set y [encoding convertto utf-8 \uD83D\uDE02]
+ binary scan $y H* z
+ list [string length $x] [string length $y] $z
+} {2 4 f09f9882}
+test encoding-15.7 {UtfToUtfProc emoji character output} {
+ set x \U1F602
+ set y [encoding convertto utf-8 \U1F602]
+ binary scan $y H* z
+ list [string length $y] $z
+} {4 f09f9882}
test encoding-16.1 {Utf16ToUtfProc} -body {
set val [encoding convertfrom utf-16 NN]