diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-06-18 15:59:19 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-06-18 15:59:19 (GMT) |
| commit | 5c87050dd8b6765c40eeef94ab5773d955c3de17 (patch) | |
| tree | 2f17fcdf4a7c6739c0fbe53952e68767c4657491 | |
| parent | 5af99f55b25d479c15bfcb63d6959a9611cd6d09 (diff) | |
| parent | 3c57d80efed172427e5aafa447365cb61439613c (diff) | |
| download | tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.zip tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.gz tcl-5c87050dd8b6765c40eeef94ab5773d955c3de17.tar.bz2 | |
Merge 8.6. And add more documentation and test-cases regarding the behavior of Tcl_UniCharToUtf()
| -rw-r--r-- | doc/Utf.3 | 12 | ||||
| -rw-r--r-- | generic/tclEnv.c | 13 | ||||
| -rw-r--r-- | generic/tclParse.c | 10 | ||||
| -rw-r--r-- | generic/tclStringObj.c | 4 | ||||
| -rw-r--r-- | generic/tclUtf.c | 7 | ||||
| -rw-r--r-- | tests/pkgIndex.tcl | 8 | ||||
| -rw-r--r-- | tests/utf.test | 12 |
7 files changed, 49 insertions, 17 deletions
@@ -121,8 +121,8 @@ case-insensitive (1). .SH DESCRIPTION .PP -These routines convert between UTF-8 strings and Tcl_UniChars. A -Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size +These routines convert between UTF-8 strings and Unicode characters. An +Unicode character represented as an unsigned, fixed-size quantity. A UTF-8 character is a Unicode character represented as a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8 sequence consists of a lead byte followed by some number of trail bytes. @@ -130,7 +130,7 @@ sequence consists of a lead byte followed by some number of trail bytes. \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to represent one Unicode character in the UTF-8 representation. .PP -\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string +\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string in starting at \fIbuf\fR. The return value is the number of bytes stored in \fIbuf\fR. .PP @@ -203,7 +203,7 @@ of \fIlength\fR bytes is long enough to be decoded by \fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee that the UTF-8 string is properly formed. This routine is used by procedures that are operating on a byte at a time and need to know if a -full Tcl_UniChar has been seen. +full Unicode character has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -211,12 +211,12 @@ returns the number of Tcl_UniChars that are represented by the UTF-8 string length is negative, all bytes up to the first null byte are used. .PP \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It -returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR +returns a pointer to the first occurrence of the Unicode character \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is considered part of the UTF-8 string. .PP \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It -returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR +returns a pointer to the last occurrence of the Unicode character \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is considered part of the UTF-8 string. .PP diff --git a/generic/tclEnv.c b/generic/tclEnv.c index 8cc4b74..40ced17 100644 --- a/generic/tclEnv.c +++ b/generic/tclEnv.c @@ -723,14 +723,25 @@ TclFinalizeEnvironment(void) * strings. This may leak more memory that strictly necessary, since some * of the strings may no longer be in the environment. However, * determining which ones are ok to delete is n-squared, and is pretty - * unlikely, so we don't bother. + * unlikely, so we don't bother. However, in the case of DPURIFY, just + * free all strings in the cache. */ if (env.cache) { +#ifdef PURIFY + int i; + for (i = 0; i < env.cacheSize; i++) { + ckfree(env.cache[i]); + } +#endif ckfree(env.cache); env.cache = NULL; env.cacheSize = 0; #ifndef USE_PUTENV + if ((env.ourEnviron != NULL)) { + ckfree(env.ourEnviron); + env.ourEnviron = NULL; + } env.ourEnvironSize = 0; #endif } diff --git a/generic/tclParse.c b/generic/tclParse.c index 92482d6..581270c 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -979,13 +979,11 @@ TclParseBackslash( if (readPtr != NULL) { *readPtr = count; } - if ((result & 0xFC00) == 0xD800) { - dst[2] = (char) ((result | 0x80) & 0xBF); - dst[1] = (char) (((result >> 6) | 0x80) & 0xBF); - dst[0] = (char) ((result >> 12) | 0xE0); - return 3; + count = Tcl_UniCharToUtf(result, dst); + if (!count) { + count = Tcl_UniCharToUtf(-1, dst); } - return Tcl_UniCharToUtf(result, dst); + return count; } /* diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index f9ac8d7..bed23a0 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2040,6 +2040,10 @@ Tcl_AppendFormatToObj( goto error; } length = Tcl_UniCharToUtf(code, buf); + if (!length) { + /* Special case for handling upper surrogates. */ + length = Tcl_UniCharToUtf(-1, buf); + } segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); allocSegment = 1; diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 693e210..ed10ab2 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -173,6 +173,13 @@ Tcl_UniCharToUtf( buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } + } else if (ch == -1) { + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2) + + ((buf[2] & 0x30) >> 4); + goto three; + } } ch = 0xFFFD; diff --git a/tests/pkgIndex.tcl b/tests/pkgIndex.tcl index 48ab71b..0feb0eb 100644 --- a/tests/pkgIndex.tcl +++ b/tests/pkgIndex.tcl @@ -1,6 +1,6 @@ #! /usr/bin/env tclsh -package ifneeded tcltests 0.1 { - source [file dirname [file dirname [file normalize [info script]/...]]]/tcltests.tcl - package provide tcltests 0.1 -} +package ifneeded tcltests 0.1 " + source [list $dir]/tcltests.tcl + package provide tcltests 0.1 +" diff --git a/tests/utf.test b/tests/utf.test index 9dd8017..67a6778 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -44,6 +44,18 @@ test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body { expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]} } -result 1 +test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring { + expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]} +} 1 +test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring { + expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]} +} 1 +test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring { + expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]} +} 1 +test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring { + expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]} +} 1 test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" |
