summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2018-06-24 20:27:19 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2018-06-24 20:27:19 (GMT)
commit6264f4978838b089748579e436d35b30a4e56201 (patch)
treec4a09f09a5dd9f7c2ac319f5f1ec231f7cc66353
parent9d19f3532ce6a01e00e85a877145088683cde4f4 (diff)
parent94f9cf81ed3e156bd372a3cac249974d1acb4e1d (diff)
downloadtcl-6264f4978838b089748579e436d35b30a4e56201.zip
tcl-6264f4978838b089748579e436d35b30a4e56201.tar.gz
tcl-6264f4978838b089748579e436d35b30a4e56201.tar.bz2
Merge 8.7
-rw-r--r--doc/Utf.317
-rw-r--r--generic/tclCmdMZ.c3
-rw-r--r--generic/tclDisassemble.c2
-rw-r--r--generic/tclEnv.c13
-rw-r--r--generic/tclExecute.c14
-rw-r--r--generic/tclParse.c11
-rw-r--r--generic/tclStringObj.c4
-rw-r--r--generic/tclUtf.c28
-rw-r--r--tests/pkgIndex.tcl8
-rw-r--r--tests/utf.test27
10 files changed, 98 insertions, 29 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 78d795e..922fd81 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -121,8 +121,8 @@ case-insensitive (1).
.SH DESCRIPTION
.PP
-These routines convert between UTF-8 strings and Tcl_UniChars. A
-Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
+These routines convert between UTF-8 strings and Unicode characters. An
+Unicode character represented as an unsigned, fixed-size
quantity. A UTF-8 character is a Unicode character represented as
a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8
sequence consists of a lead byte followed by some number of trail bytes.
@@ -130,9 +130,12 @@ sequence consists of a lead byte followed by some number of trail bytes.
\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
represent one Unicode character in the UTF-8 representation.
.PP
-\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
+\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
in starting at \fIbuf\fR. The return value is the number of bytes stored
-in \fIbuf\fR.
+in \fIbuf\fR. If ch is an upper surrogate (range U+D800 - U+DBFF), then
+the return value will be 0 and nothing will be stored. If you still
+want to produce UTF-8 output for it (even though knowing it's an illegal
+code-point on its own), just call \fBTcl_UniCharToUtf\fR again using ch = -1.
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the
@@ -203,7 +206,7 @@ of \fIlength\fR bytes is long enough to be decoded by
\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee
that the UTF-8 string is properly formed. This routine is used by
procedures that are operating on a byte at a time and need to know if a
-full Tcl_UniChar has been seen.
+full Unicode character has been seen.
.PP
\fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It
returns the number of Tcl_UniChars that are represented by the UTF-8 string
@@ -211,12 +214,12 @@ returns the number of Tcl_UniChars that are represented by the UTF-8 string
length is negative, all bytes up to the first null byte are used.
.PP
\fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It
-returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
+returns a pointer to the first occurrence of the Unicode character \fIch\fR
in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
considered part of the UTF-8 string.
.PP
\fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It
-returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
+returns a pointer to the last occurrence of the Unicode character \fIch\fR
in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
considered part of the UTF-8 string.
.PP
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 09d7963..684daeb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1447,6 +1447,9 @@ StringIndexCmd(
char buf[4];
length = Tcl_UniCharToUtf(ch, buf);
+ if (!length) {
+ length = Tcl_UniCharToUtf(-1, buf);
+ }
Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
}
}
diff --git a/generic/tclDisassemble.c b/generic/tclDisassemble.c
index 77b18df..fe18119 100644
--- a/generic/tclDisassemble.c
+++ b/generic/tclDisassemble.c
@@ -894,7 +894,7 @@ PrintSourceToObj(
Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ch);
i += 10;
} else
-#elif TCL_UTF_MAX > 3
+#else
/* If len == 0, this means we have a char > 0xffff, resulting in
* TclUtfToUniChar producing a surrogate pair. We want to output
* this pair as a single Unicode character.
diff --git a/generic/tclEnv.c b/generic/tclEnv.c
index 8cc4b74..40ced17 100644
--- a/generic/tclEnv.c
+++ b/generic/tclEnv.c
@@ -723,14 +723,25 @@ TclFinalizeEnvironment(void)
* strings. This may leak more memory that strictly necessary, since some
* of the strings may no longer be in the environment. However,
* determining which ones are ok to delete is n-squared, and is pretty
- * unlikely, so we don't bother.
+ * unlikely, so we don't bother. However, in the case of DPURIFY, just
+ * free all strings in the cache.
*/
if (env.cache) {
+#ifdef PURIFY
+ int i;
+ for (i = 0; i < env.cacheSize; i++) {
+ ckfree(env.cache[i]);
+ }
+#endif
ckfree(env.cache);
env.cache = NULL;
env.cacheSize = 0;
#ifndef USE_PUTENV
+ if ((env.ourEnviron != NULL)) {
+ ckfree(env.ourEnviron);
+ env.ourEnviron = NULL;
+ }
env.ourEnvironSize = 0;
#endif
}
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 8d10e6b..b3ead9d 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -4754,7 +4754,7 @@ TEBCresume(
/* Decode index value operands. */
- /*
+ /*
assert ( toIdx != TCL_INDEX_AFTER);
*
* Extra safety for legacy bytecodes:
@@ -5013,9 +5013,15 @@ TEBCresume(
* but creating the object as a string seems to be faster in
* practical use.
*/
-
- length = (ch != -1) ? Tcl_UniCharToUtf(ch, buf) : 0;
- objResultPtr = Tcl_NewStringObj(buf, length);
+ if (ch == -1) {
+ objResultPtr = Tcl_NewObj();
+ } else {
+ length = Tcl_UniCharToUtf(ch, buf);
+ if (!length) {
+ length = Tcl_UniCharToUtf(-1, buf);
+ }
+ objResultPtr = Tcl_NewStringObj(buf, length);
+ }
}
TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr)));
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 92482d6..00b83a1 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -979,13 +979,12 @@ TclParseBackslash(
if (readPtr != NULL) {
*readPtr = count;
}
- if ((result & 0xFC00) == 0xD800) {
- dst[2] = (char) ((result | 0x80) & 0xBF);
- dst[1] = (char) (((result >> 6) | 0x80) & 0xBF);
- dst[0] = (char) ((result >> 12) | 0xE0);
- return 3;
+ count = Tcl_UniCharToUtf(result, dst);
+ if (!count) {
+ /* Special case for handling upper surrogates. */
+ count = Tcl_UniCharToUtf(-1, dst);
}
- return Tcl_UniCharToUtf(result, dst);
+ return count;
}
/*
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 8ccd189..fb33821 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -2014,6 +2014,10 @@ Tcl_AppendFormatToObj(
goto error;
}
length = Tcl_UniCharToUtf(code, buf);
+ if (!length) {
+ /* Special case for handling upper surrogates. */
+ length = Tcl_UniCharToUtf(-1, buf);
+ }
segment = Tcl_NewStringObj(buf, length);
Tcl_IncrRefCount(segment);
allocSegment = 1;
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 693e210..c8292a2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -173,6 +173,13 @@ Tcl_UniCharToUtf(
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
+ } else if (ch == -1) {
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
+ + ((buf[2] & 0x30) >> 4);
+ goto three;
+ }
}
ch = 0xFFFD;
@@ -211,7 +218,7 @@ Tcl_UniCharToUtfDString(
{
const Tcl_UniChar *w, *wEnd;
char *p, *string;
- int oldLength;
+ int oldLength, len = 1;
/*
* UTF-8 string length in bytes will be <= Unicode string length * 4.
@@ -224,9 +231,18 @@ Tcl_UniCharToUtfDString(
p = string;
wEnd = uniStr + uniLength;
for (w = uniStr; w < wEnd; ) {
- p += Tcl_UniCharToUtf(*w, p);
+ if (!len && ((*w & 0xFC00) != 0xDC00)) {
+ /* Special case for handling upper surrogates. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
+ len = Tcl_UniCharToUtf(*w, p);
+ p += len;
w++;
}
+ if (!len) {
+ /* Special case for handling upper surrogates. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
return string;
@@ -892,7 +908,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(upChar)) {
+ if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -955,7 +971,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(lowChar)) {
+ if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -1015,7 +1031,7 @@ Tcl_UtfToTitle(
#endif
titleChar = Tcl_UniCharToTitle(titleChar);
- if (bytes < TclUtfCount(titleChar)) {
+ if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -1039,7 +1055,7 @@ Tcl_UtfToTitle(
lowChar = Tcl_UniCharToLower(lowChar);
}
- if (bytes < TclUtfCount(lowChar)) {
+ if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
diff --git a/tests/pkgIndex.tcl b/tests/pkgIndex.tcl
index 48ab71b..0feb0eb 100644
--- a/tests/pkgIndex.tcl
+++ b/tests/pkgIndex.tcl
@@ -1,6 +1,6 @@
#! /usr/bin/env tclsh
-package ifneeded tcltests 0.1 {
- source [file dirname [file dirname [file normalize [info script]/...]]]/tcltests.tcl
- package provide tcltests 0.1
-}
+package ifneeded tcltests 0.1 "
+ source [list $dir]/tcltests.tcl
+ package provide tcltests 0.1
+"
diff --git a/tests/utf.test b/tests/utf.test
index 9dd8017..e820359 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -44,6 +44,18 @@ test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body {
expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
} -result 1
+test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
+ expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]}
+} 1
+test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
+ expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]}
+} 1
+test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
+ expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]}
+} 1
+test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
+ expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]}
+} 1
test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
string length "abc"
@@ -146,6 +158,12 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
string index \u4e4e\u25a\xff\u543 2
} "\uff"
+test utf-8.5 {Tcl_UniCharAtIndex: upper surrogate} {
+ string index \ud842 0
+} "\ud842"
+test utf-8.5 {Tcl_UniCharAtIndex: lower surrogate} {
+ string index \udc42 0
+} "\udc42"
test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
string range abcd 0 2
@@ -251,6 +269,9 @@ test utf-11.4 {Tcl_UtfToUpper} {
test utf-11.5 {Tcl_UtfToUpper Georgian (new in Unicode 11)} {
string toupper \u10d0\u1c90
} \u1c90\u1c90
+test utf-11.6 {Tcl_UtfToUpper low/high surrogate)} {
+ string toupper \udc24\ud824
+} \udc24\ud824
test utf-12.1 {Tcl_UtfToLower} {
string tolower {}
@@ -267,6 +288,9 @@ test utf-12.4 {Tcl_UtfToLower} {
test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} {
string tolower \u10d0\u1c90
} \u10d0\u10d0
+test utf-12.6 {Tcl_UtfToUpper low/high surrogate)} {
+ string tolower \udc24\ud824
+} \udc24\ud824
test utf-13.1 {Tcl_UtfToTitle} {
string totitle {}
@@ -286,6 +310,9 @@ test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
string totitle \u1c90\u10d0
} \u1c90\u10d0
+test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} {
+ string totitle \udc24\ud824
+} \udc24\ud824
test utf-14.1 {Tcl_UtfNcasecmp} {
string compare -nocase a b