From eb3422673ff62a7427f2e6d166840fce68237d74 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 11 Jan 2018 14:11:38 +0000 Subject: Add test-cases for bug [11ae2be95dac9417], and make a start fixing it. Almost works. --- generic/tclCmdMZ.c | 12 ++++++++++-- generic/tclStringObj.c | 33 +++++++++++++++++++++++++++++++-- tests/string.test | 7 +++++++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index a206cc5..a23f007 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1435,7 +1435,7 @@ StringIndexCmd( } /* - * Get the char length to calulate what 'end' means. + * Get the char length to calculate what 'end' means. */ length = Tcl_GetCharLength(objv[1]); @@ -1444,7 +1444,15 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = Tcl_GetUniChar(objv[1], index); + + if (ch >= 0x10000) { + printf("HI: %x\n", ch); + } + if (ch == -1) { + printf("LO: %x\n", ch); + return TCL_OK; + } /* * If we have a ByteArray object, we're careful to generate a new diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 01f8d80..aa5aba3 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -460,7 +460,8 @@ Tcl_GetCharLength( * Tcl_GetUniChar -- * * Get the index'th Unicode character from the String object. The index - * is assumed to be in the appropriate range. + * is assumed to be in the appropriate range. If index references a lower + * surrogate preceded by a higher surrogate, the result = -1; * * Results: * Returns the index'th Unicode character in the Object. @@ -478,6 +479,7 @@ Tcl_GetUniChar( int index) /* Get the index'th Unicode character. */ { String *stringPtr; + int ch; /* * Optimize the case where we're really dealing with a bytearray object @@ -512,7 +514,23 @@ Tcl_GetUniChar( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return (int) stringPtr->unicode[index]; + + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX == 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch&0xF800) == 0xD800) { + if (ch&0x400) { + if ((index > 0) && ((stringPtr->unicode[index-1]&0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index]&0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; } /* @@ -656,6 +674,17 @@ Tcl_GetRange( stringPtr = GET_STRING(objPtr); } +#if TCL_UTF_MAX == 4 + /* See: bug [11ae2be95dac9417] */ + if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00) + && ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) { + ++first; + } + if ((last+1numChars) && ((stringPtr->unicode[last+1]&0xFC00) == 0xDC00) + && ((stringPtr->unicode[last]&0xFC00) == 0xD800)) { + ++last; + } +#endif return Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1); } diff --git a/tests/string.test b/tests/string.test index cebaf4c..58328bb 100644 --- a/tests/string.test +++ b/tests/string.test @@ -24,6 +24,7 @@ catch [list package require -exact Tcltest [info patchlevel]] testConstraint testobj [expr {[info commands testobj] != {}}] testConstraint testindexobj [expr {[info commands testindexobj] != {}}] +testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -302,6 +303,9 @@ test string-5.19 {string index, bytearray object out of bounds} { test string-5.20 {string index, bytearray object out of bounds} { string index [binary format I* {0x50515253 0x52}] 20 } {} +test string-5.21 {string index, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3] +} [list \U100000 {} b] proc largest_int {} { @@ -1288,6 +1292,9 @@ test string-12.22 {string range, shimmering binary/index} { binary scan $s a* x string range $s $s end } 000000001 +test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3] +} [list \U100000 {} b] test string-13.1 {string repeat} { list [catch {string repeat} msg] $msg -- cgit v0.12