Merge 8.6. And add more documentation and test-cases regarding the behavior of Tcl_UniCharToUtf()

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2018-06-18 15:59:19 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2018-06-18 15:59:19 (GMT)
commit: 5c87050dd8b6765c40eeef94ab5773d955c3de17 (patch)
tree: 2f17fcdf4a7c6739c0fbe53952e68767c4657491
parent: 5af99f55b25d479c15bfcb63d6959a9611cd6d09 (diff)
parent: 3c57d80efed172427e5aafa447365cb61439613c (diff)
download: tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.zip
tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.gz
tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.bz2
7 files changed, 49 insertions, 17 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 78d795e..160575b 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -121,8 +121,8 @@ case-insensitive (1).
 
 .SH DESCRIPTION
 .PP
-These routines convert between UTF-8 strings and Tcl_UniChars.  A
-Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
+These routines convert between UTF-8 strings and Unicode characters.  An
+Unicode character represented as an unsigned, fixed-size
 quantity.  A UTF-8 character is a Unicode character represented as
 a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes.  A multibyte UTF-8
 sequence consists of a lead byte followed by some number of trail bytes.
@@ -130,7 +130,7 @@ sequence consists of a lead byte followed by some number of trail bytes.
 \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
 represent one Unicode character in the UTF-8 representation.
 .PP
-\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
+\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
 in starting at \fIbuf\fR.  The return value is the number of bytes stored
 in \fIbuf\fR.
 .PP
@@ -203,7 +203,7 @@ of \fIlength\fR bytes is long enough to be decoded by
 \fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee
 that the UTF-8 string is properly formed.  This routine is used by
 procedures that are operating on a byte at a time and need to know if a
-full Tcl_UniChar has been seen.
+full Unicode character has been seen.
 .PP
 \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It
 returns the number of Tcl_UniChars that are represented by the UTF-8 string
@@ -211,12 +211,12 @@ returns the number of Tcl_UniChars that are represented by the UTF-8 string
 length is negative, all bytes up to the first null byte are used.
 .PP
 \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings.  It
-returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
+returns a pointer to the first occurrence of the Unicode character \fIch\fR
 in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
 considered part of the UTF-8 string.
 .PP
 \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings.  It
-returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
+returns a pointer to the last occurrence of the Unicode character \fIch\fR
 in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
 considered part of the UTF-8 string.
 .PP
diff --git a/generic/tclEnv.c b/generic/tclEnv.c
index 8cc4b74..40ced17 100644
--- a/generic/tclEnv.c
+++ b/generic/tclEnv.c
@@ -723,14 +723,25 @@ TclFinalizeEnvironment(void)
      * strings. This may leak more memory that strictly necessary, since some
      * of the strings may no longer be in the environment. However,
      * determining which ones are ok to delete is n-squared, and is pretty
-     * unlikely, so we don't bother.
+     * unlikely, so we don't bother.  However, in the case of DPURIFY, just
+     * free all strings in the cache.
      */
 
     if (env.cache) {
+#ifdef PURIFY
+	int i;
+	for (i = 0; i < env.cacheSize; i++) {
+	    ckfree(env.cache[i]);
+	}
+#endif
 	ckfree(env.cache);
 	env.cache = NULL;
 	env.cacheSize = 0;
 #ifndef USE_PUTENV
+	if ((env.ourEnviron != NULL)) {
+	    ckfree(env.ourEnviron);
+	    env.ourEnviron = NULL;
+	}
 	env.ourEnvironSize = 0;
 #endif
     }
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 92482d6..581270c 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -979,13 +979,11 @@ TclParseBackslash(
     if (readPtr != NULL) {
 	*readPtr = count;
     }
-    if ((result & 0xFC00) == 0xD800) {
-	dst[2] = (char) ((result | 0x80) & 0xBF);
-	dst[1] = (char) (((result >> 6) | 0x80) & 0xBF);
-	dst[0] = (char) ((result >> 12) | 0xE0);
-	return 3;
+    count = Tcl_UniCharToUtf(result, dst);
+    if (!count) {
+	count = Tcl_UniCharToUtf(-1, dst);
     }
-    return Tcl_UniCharToUtf(result, dst);
+    return count;
 }
 
 /*
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index f9ac8d7..bed23a0 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -2040,6 +2040,10 @@ Tcl_AppendFormatToObj(
 		goto error;
 	    }
 	    length = Tcl_UniCharToUtf(code, buf);
+	    if (!length) {
+		/* Special case for handling upper surrogates. */
+		length = Tcl_UniCharToUtf(-1, buf);
+	    }
 	    segment = Tcl_NewStringObj(buf, length);
 	    Tcl_IncrRefCount(segment);
 	    allocSegment = 1;
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 693e210..ed10ab2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -173,6 +173,13 @@ Tcl_UniCharToUtf(
 	    buf[0] = (char) ((ch >> 18) | 0xF0);
 	    return 4;
 	}
+    } else if (ch == -1) {
+	if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+		&& ((buf[2] & 0xCF) == 0)) {
+	    ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
+		    + ((buf[2] & 0x30) >> 4);
+	    goto three;
+	}
     }
 
     ch = 0xFFFD;
diff --git a/tests/pkgIndex.tcl b/tests/pkgIndex.tcl
index 48ab71b..0feb0eb 100644
--- a/tests/pkgIndex.tcl
+++ b/tests/pkgIndex.tcl
@@ -1,6 +1,6 @@
 #! /usr/bin/env tclsh
 
-package ifneeded tcltests 0.1 {
-	source [file dirname [file dirname [file normalize [info script]/...]]]/tcltests.tcl
-	package provide tcltests 0.1
-}
+package ifneeded tcltests 0.1 "
+    source [list $dir]/tcltests.tcl
+    package provide tcltests 0.1
+"
diff --git a/tests/utf.test b/tests/utf.test
index 9dd8017..67a6778 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -44,6 +44,18 @@ test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
 test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body {
     expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
 } -result 1
+test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
+    expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]}
+} 1
+test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
+    expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]}
+} 1
+test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
+    expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]}
+} 1
+test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
+    expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]}
+} 1
 
 test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
     string length "abc"
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2018-06-18 15:59:19 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2018-06-18 15:59:19 (GMT)
commit	5c87050dd8b6765c40eeef94ab5773d955c3de17 (patch)
tree	2f17fcdf4a7c6739c0fbe53952e68767c4657491
parent	5af99f55b25d479c15bfcb63d6959a9611cd6d09 (diff)
parent	3c57d80efed172427e5aafa447365cb61439613c (diff)
download	tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.zip tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.gz tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.bz2