From 48b529209c87473364215e8aef740e331f88415a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 7 Nov 2017 12:15:30 +0000 Subject: Somewhat simplified implementation of TIP #389, in which the "string length" if characters > U+FFFF is considered to be 2, not 1. --- doc/StringObj.3 | 2 +- doc/ToUpper.3 | 2 +- doc/Utf.3 | 2 +- generic/tcl.decls | 10 +++--- generic/tcl.h | 2 +- generic/tclCmdMZ.c | 18 ++++++++-- generic/tclDecls.h | 20 +++++------ generic/tclScan.c | 12 +++++-- generic/tclStringObj.c | 12 +++---- generic/tclUtf.c | 95 +++++++++++++++++++++++++++++++++++--------------- tests/string.test | 26 +++++++------- tests/utf.test | 4 +-- 12 files changed, 132 insertions(+), 73 deletions(-) diff --git a/doc/StringObj.3 b/doc/StringObj.3 index 7042cc8..8d9bb56 100644 --- a/doc/StringObj.3 +++ b/doc/StringObj.3 @@ -37,7 +37,7 @@ Tcl_UniChar * Tcl_UniChar * \fBTcl_GetUnicode\fR(\fIobjPtr\fR) .sp -Tcl_UniChar +int \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR) .sp int diff --git a/doc/ToUpper.3 b/doc/ToUpper.3 index b933e9c..14766da 100644 --- a/doc/ToUpper.3 +++ b/doc/ToUpper.3 @@ -13,7 +13,7 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_ .nf \fB#include \fR .sp -Tcl_UniChar +int \fBTcl_UniCharToUpper\fR(\fIch\fR) .sp Tcl_UniChar diff --git a/doc/Utf.3 b/doc/Utf.3 index 378c806..5cd6b7df 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -63,7 +63,7 @@ const char * const char * \fBTcl_UtfPrev\fR(\fIsrc, start\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) .sp const char * diff --git a/generic/tcl.decls b/generic/tcl.decls index b2b91a9..e3ea9bc 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1148,16 +1148,16 @@ declare 319 { Tcl_QueuePosition position) } declare 320 { - Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index) + int Tcl_UniCharAtIndex(const char *src, int index) } declare 321 { - Tcl_UniChar Tcl_UniCharToLower(int ch) + int Tcl_UniCharToLower(int ch) } declare 322 { - Tcl_UniChar Tcl_UniCharToTitle(int ch) + int Tcl_UniCharToTitle(int ch) } declare 323 { - Tcl_UniChar Tcl_UniCharToUpper(int ch) + int Tcl_UniCharToUpper(int ch) } declare 324 { int Tcl_UniCharToUtf(int ch, char *buf) @@ -1351,7 +1351,7 @@ declare 380 { int Tcl_GetCharLength(Tcl_Obj *objPtr) } declare 381 { - Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index) + int Tcl_GetUniChar(Tcl_Obj *objPtr, int index) } declare 382 { Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr) diff --git a/generic/tcl.h b/generic/tcl.h index 07d841d..f874997 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2201,7 +2201,7 @@ typedef struct Tcl_EncodingType { */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +#define TCL_UTF_MAX 4 #endif /* diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 2195aa1..b6a8fe9 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -309,7 +309,7 @@ Tcl_RegexpObjCmd( eflags = 0; } else if (offset > stringLength) { eflags = TCL_REG_NOTBOL; - } else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') { + } else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') { eflags = 0; } else { eflags = TCL_REG_NOTBOL; @@ -1218,6 +1218,12 @@ Tcl_SplitObjCmd( for ( ; stringPtr < end; stringPtr += len) { len = TclUtfToUniChar(stringPtr, &ch); +#if TCL_UTF_MAX == 4 + if (!len) { + continue; + } +#endif + /* * Assume Tcl_UniChar is an integral type... */ @@ -1814,8 +1820,16 @@ StringIsCmd( } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { + int fullchar; length2 = TclUtfToUniChar(string1, &ch); - if (!chcomp(ch)) { + fullchar = ch; +#if TCL_UTF_MAX == 4 + if (!length2) { + length2 = TclUtfToUniChar(string1, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (!chcomp(fullchar)) { result = 0; break; } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 464fc0f..5f83636 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -959,13 +959,13 @@ EXTERN void Tcl_ThreadAlert(Tcl_ThreadId threadId); EXTERN void Tcl_ThreadQueueEvent(Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 320 */ -EXTERN Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index); +EXTERN int Tcl_UniCharAtIndex(const char *src, int index); /* 321 */ -EXTERN Tcl_UniChar Tcl_UniCharToLower(int ch); +EXTERN int Tcl_UniCharToLower(int ch); /* 322 */ -EXTERN Tcl_UniChar Tcl_UniCharToTitle(int ch); +EXTERN int Tcl_UniCharToTitle(int ch); /* 323 */ -EXTERN Tcl_UniChar Tcl_UniCharToUpper(int ch); +EXTERN int Tcl_UniCharToUpper(int ch); /* 324 */ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ @@ -1117,7 +1117,7 @@ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, /* 380 */ EXTERN int Tcl_GetCharLength(Tcl_Obj *objPtr); /* 381 */ -EXTERN Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index); +EXTERN int Tcl_GetUniChar(Tcl_Obj *objPtr, int index); /* 382 */ EXTERN Tcl_UniChar * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ @@ -2186,10 +2186,10 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */ void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */ void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */ - Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ - Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */ - Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */ - Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */ + int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ + int (*tcl_UniCharToLower) (int ch); /* 321 */ + int (*tcl_UniCharToTitle) (int ch); /* 322 */ + int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ CONST84_RETURN char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ @@ -2247,7 +2247,7 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */ void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ - Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ + int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ diff --git a/generic/tclScan.c b/generic/tclScan.c index e1fcad4..7f71262 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -885,9 +885,17 @@ Tcl_ScanObjCmd( * Scan a single Unicode character. */ - string += TclUtfToUniChar(string, &sch); + offset = TclUtfToUniChar(string, &sch); + i = (int)sch; +#if TCL_UTF_MAX == 4 + if (!offset) { + offset = Tcl_UtfToUniChar(string, &sch); + i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); + } +#endif + string += offset; if (!(flags & SCAN_SUPPRESS)) { - objPtr = Tcl_NewIntObj((int)sch); + objPtr = Tcl_NewIntObj(i); Tcl_IncrRefCount(objPtr); CLANG_ASSERT(objs); objs[objIndex++] = objPtr; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 3a35bcf..1ccd778 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -466,7 +466,7 @@ Tcl_GetCharLength( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_GetUniChar( Tcl_Obj *objPtr, /* The object to get the Unicode charater * from. */ @@ -483,7 +483,7 @@ Tcl_GetUniChar( if (TclIsPureByteArray(objPtr)) { unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); - return (Tcl_UniChar) bytes[index]; + return (int) bytes[index]; } /* @@ -507,7 +507,7 @@ Tcl_GetUniChar( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return stringPtr->unicode[index]; + return (int) stringPtr->unicode[index]; } /* @@ -3462,7 +3462,6 @@ TclStringObjReverse( * Tcl_SetObjLength into growing the unicode rep buffer. */ - ch = 0; objPtr = Tcl_NewUnicodeObj(&ch, 1); Tcl_SetObjLength(objPtr, stringPtr->numChars); to = Tcl_GetUnicode(objPtr); @@ -3565,7 +3564,7 @@ ExtendUnicodeRepWithString( { String *stringPtr = GET_STRING(objPtr); int needed, numOrigChars = 0; - Tcl_UniChar *dst; + Tcl_UniChar *dst, unichar = 0; if (stringPtr->hasUnicode) { numOrigChars = stringPtr->numChars; @@ -3588,7 +3587,8 @@ ExtendUnicodeRepWithString( numAppendChars = 0; } for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { - bytes += TclUtfToUniChar(bytes, dst); + bytes += TclUtfToUniChar(bytes, &unichar); + *dst = unichar; } *dst = 0; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 25cc2d1..859fe78 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -699,7 +699,7 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ @@ -819,7 +819,8 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int bytes; @@ -830,7 +831,14 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -872,7 +880,8 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int bytes; @@ -883,7 +892,14 @@ Tcl_UtfToLower( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -926,7 +942,8 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int bytes; @@ -939,7 +956,14 @@ Tcl_UtfToTitle( if (*src) { bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + titleChar = Tcl_UniCharToTitle(titleChar); if (bytes < TclUtfCount(titleChar)) { memcpy(dst, src, (size_t) bytes); @@ -951,7 +975,14 @@ Tcl_UtfToTitle( } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); if (bytes < TclUtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); @@ -1159,16 +1190,18 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1187,16 +1220,18 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + if (GetCaseType(info) & 0x02) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1215,23 +1250,25 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); + ch += ((mode & 0x4) ? -1 : 1); + } else if (mode == 0x4) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* diff --git a/tests/string.test b/tests/string.test index cb901b9..cebaf4c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1697,40 +1697,40 @@ test string-24.4 {string reverse command - unshared string} { string reverse $x$y } edcba test string-24.5 {string reverse command - shared unicode string} { - set x abcde\udead + set x abcde\ud0ad string reverse $x -} \udeadedcba +} \ud0adedcba test string-24.6 {string reverse command - unshared string} { set x abc - set y de\udead + set y de\ud0ad string reverse $x$y -} \udeadedcba +} \ud0adedcba test string-24.7 {string reverse command - simple case} { string reverse a } a test string-24.8 {string reverse command - simple case} { - string reverse \udead -} \udead + string reverse \ud0ad +} \ud0ad test string-24.9 {string reverse command - simple case} { string reverse {} } {} test string-24.10 {string reverse command - corner case} { - set x \ubeef\udead + set x \ubeef\ud0ad string reverse $x -} \udead\ubeef +} \ud0ad\ubeef test string-24.11 {string reverse command - corner case} { set x \ubeef - set y \udead + set y \ud0ad string reverse $x$y -} \udead\ubeef +} \ud0ad\ubeef test string-24.12 {string reverse command - corner case} { set x \ubeef - set y \udead + set y \ud0ad string is ascii [string reverse $x$y] } 0 test string-24.13 {string reverse command - pure Unicode string} { - string reverse [string range \ubeef\udead\ubeef\udead\ubeef\udead 1 5] -} \udead\ubeef\udead\ubeef\udead + string reverse [string range \ubeef\ud0ad\ubeef\ud0ad\ubeef\ud0ad 1 5] +} \ud0ad\ubeef\ud0ad\ubeef\ud0ad test string-24.14 {string reverse command - pure bytearray} { binary scan [string reverse [binary format H* 010203]] H* x set x diff --git a/tests/utf.test b/tests/utf.test index 422ab08..45f9c0c 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -68,10 +68,10 @@ test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestrin } {1} test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF0\x90\x80\x80"] -} -result {1} +} -result {2} test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] -} -result {1} +} -result {2} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} -- cgit v0.12 From 2ea8f0fe98ed8a2f72d7d355b9c080fbb5bdd912 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Nov 2017 16:08:49 +0000 Subject: merge core-8-branch. Fix some Tcl_UniChar initialization, in case TCL_UTF_MAX == 4 --- doc/ToUpper.3 | 6 +++--- doc/UniCharIsAlpha.3 | 7 ++----- doc/Utf.3 | 2 +- generic/tclScan.c | 2 +- generic/tclStringObj.c | 10 +++++----- generic/tclUtf.c | 34 ++++++++++++++++++++++++---------- generic/tclUtil.c | 14 +++++++------- win/tclWinSerial.c | 6 +++--- 8 files changed, 46 insertions(+), 35 deletions(-) diff --git a/doc/ToUpper.3 b/doc/ToUpper.3 index 14766da..b06b793 100644 --- a/doc/ToUpper.3 +++ b/doc/ToUpper.3 @@ -16,10 +16,10 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_ int \fBTcl_UniCharToUpper\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToLower\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToTitle\fR(\fIch\fR) .sp int @@ -33,7 +33,7 @@ int .SH ARGUMENTS .AS char *str in/out .AP int ch in -The Tcl_UniChar to be converted. +The character to be converted. .AP char *str in/out Pointer to UTF-8 string to be converted in place. .BE diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 2336c34..e1d23ab 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -48,19 +48,16 @@ int .SH ARGUMENTS .AS int ch .AP int ch in -The Tcl_UniChar to be examined. +The character to be examined. .BE .SH DESCRIPTION .PP -All of the routines described examine Tcl_UniChars and return a +All of the routines described examine characters and return a boolean value. A non-zero return value means that the character does belong to the character class associated with the called routine. The rest of this document just describes the character classes associated with the various routines. -.PP -Note: A Tcl_UniChar is a Unicode character represented as an unsigned, -fixed-size quantity. .SH "CHARACTER CLASSES" .PP diff --git a/doc/Utf.3 b/doc/Utf.3 index 5cd6b7df..638f349 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -77,7 +77,7 @@ int Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. .AP int ch in -The Tcl_UniChar to be converted or examined. +The character to be converted or examined. .AP Tcl_UniChar *chPtr out Filled with the Tcl_UniChar represented by the head of the UTF-8 string. .AP "const char" *src in diff --git a/generic/tclScan.c b/generic/tclScan.c index 7f71262..e0798df 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -889,7 +889,7 @@ Tcl_ScanObjCmd( i = (int)sch; #if TCL_UTF_MAX == 4 if (!offset) { - offset = Tcl_UtfToUniChar(string, &sch); + offset = TclUtfToUniChar(string, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } #endif diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 1ccd778..fda6ac1 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1869,20 +1869,20 @@ Tcl_AppendFormatToObj( } else if (ch == 'I') { if ((format[1] == '6') && (format[2] == '4')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif } else if ((format[1] == '3') && (format[2] == '2')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } } else if ((ch == 't') || (ch == 'z')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG if (sizeof(size_t) > sizeof(int)) { useWide = 1; @@ -1890,7 +1890,7 @@ Tcl_AppendFormatToObj( #endif } else if ((ch == 'q') ||(ch == 'j')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 859fe78..e651757 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch, *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p, *end; int oldLength; @@ -522,12 +522,12 @@ Tcl_NumUtfChars( * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar in + * Returns a pointer to the first occurance of the given character in * the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the + * As above. If the character does not exist in the given string, the * return value is NULL. * * Side effects: @@ -539,14 +539,21 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(stringPtr, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (find == fullchar) { return src; } if (*src == '\0') { @@ -578,16 +585,23 @@ Tcl_UtfFindFirst( const char * Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(stringPtr, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (find == fullchar) { last = src; } if (*src == '\0') { @@ -1158,7 +1172,7 @@ TclUtfCasecmp( const char *ct) /* UTF string cs is compared to. */ { while (*cs && *ct) { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 8ebace5..21d1071 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1695,7 +1695,7 @@ TclTrimRight( */ do { - Tcl_UniChar ch2; + Tcl_UniChar ch2 = 0; int qInc = TclUtfToUniChar(q, &ch2); if (ch1 == ch2) { @@ -1763,7 +1763,7 @@ TclTrimLeft( */ do { - Tcl_UniChar ch1; + Tcl_UniChar ch1 = 0; int pInc = TclUtfToUniChar(p, &ch1); const char *q = trim; int bytesLeft = numTrim; @@ -1773,7 +1773,7 @@ TclTrimLeft( */ do { - Tcl_UniChar ch2; + Tcl_UniChar ch2 = 0; int qInc = TclUtfToUniChar(q, &ch2); if (ch1 == ch2) { @@ -2107,7 +2107,7 @@ Tcl_StringCaseMatch( { int p, charLen; const char *pstart = pattern; - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; while (1) { p = *pattern; @@ -2217,7 +2217,7 @@ Tcl_StringCaseMatch( */ if (p == '[') { - Tcl_UniChar startChar, endChar; + Tcl_UniChar startChar = 0, endChar = 0; pattern++; if (UCHAR(*str) < 0x80) { @@ -2225,7 +2225,7 @@ Tcl_StringCaseMatch( (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); str++; } else { - str += Tcl_UtfToUniChar(str, &ch1); + str += TclUtfToUniChar(str, &ch1); if (nocase) { ch1 = Tcl_UniCharToLower(ch1); } @@ -2254,7 +2254,7 @@ Tcl_StringCaseMatch( ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); pattern++; } else { - pattern += Tcl_UtfToUniChar(pattern, &endChar); + pattern += TclUtfToUniChar(pattern, &endChar); if (nocase) { endChar = Tcl_UniCharToLower(endChar); } diff --git a/win/tclWinSerial.c b/win/tclWinSerial.c index ed1a8e5..894f431 100644 --- a/win/tclWinSerial.c +++ b/win/tclWinSerial.c @@ -1738,15 +1738,15 @@ SerialSetOptionProc( dcb.XonChar = argv[0][0]; dcb.XoffChar = argv[1][0]; if (argv[0][0] & 0x80 || argv[1][0] & 0x80) { - Tcl_UniChar character; + Tcl_UniChar character = 0; int charLen; - charLen = Tcl_UtfToUniChar(argv[0], &character); + charLen = TclUtfToUniChar(argv[0], &character); if (argv[0][charLen]) { goto badXchar; } dcb.XonChar = (char) character; - charLen = Tcl_UtfToUniChar(argv[1], &character); + charLen = TclUtfToUniChar(argv[1], &character); if (argv[1][charLen]) { goto badXchar; } -- cgit v0.12 From 61536390c542d9aa9d1a91d173190cd2421294e5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Nov 2017 13:05:36 +0000 Subject: Fix executable flags --- generic/tclDecls.h | 0 generic/tclIntDecls.h | 0 generic/tclStubInit.c | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 generic/tclDecls.h mode change 100755 => 100644 generic/tclIntDecls.h mode change 100755 => 100644 generic/tclStubInit.c diff --git a/generic/tclDecls.h b/generic/tclDecls.h old mode 100755 new mode 100644 diff --git a/generic/tclIntDecls.h b/generic/tclIntDecls.h old mode 100755 new mode 100644 diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c old mode 100755 new mode 100644 -- cgit v0.12 From 3af16acbcb63ea2935d71b905371252560dc4659 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 29 Nov 2017 08:59:49 +0000 Subject: Treat invalid UTF-8 characters in the range 0x80-0x9F as cp1252: See [https://en.wikipedia.org/wiki/UTF-8]. To be added to TIP #389 --- doc/Utf.3 | 3 +++ generic/tclInt.h | 2 +- generic/tclUtf.c | 17 +++++++++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 638f349..de9545d 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -140,6 +140,9 @@ number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is +a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the +cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR +and returns 1. If the input is otherwise not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and 0x00ff and return 1. diff --git a/generic/tclInt.h b/generic/tclInt.h index ef88bf5..d77889e 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4446,7 +4446,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, */ #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0xC0) ? \ + ((((unsigned char) *(str)) < 0x80) ? \ ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4ed201f..aed332f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -272,6 +272,13 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ @@ -288,11 +295,17 @@ Tcl_UtfToUniChar( if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; + if ((unsigned)(byte-0x80) < (unsigned) 0x20) { + *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + } else { + *chPtr = (Tcl_UniChar) byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { -- cgit v0.12 From 03c66864aa2ffa9871ce216b00cd661eaf1be688 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 29 Nov 2017 09:49:31 +0000 Subject: Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely --- generic/tclCmdIL.c | 4 +-- generic/tclCmdMZ.c | 2 +- generic/tclInt.h | 1 + generic/tclUtf.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 80 insertions(+), 6 deletions(-) diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c index 47076ec..b41d312 100644 --- a/generic/tclCmdIL.c +++ b/generic/tclCmdIL.c @@ -2945,7 +2945,7 @@ Tcl_LsearchObjCmd( double patDouble, objDouble; SortInfo sortInfo; Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr; - SortStrCmpFn_t strCmpFn = strcmp; + SortStrCmpFn_t strCmpFn = TclUtfCmp; Tcl_RegExp regexp = NULL; static const char *const options[] = { "-all", "-ascii", "-bisect", "-decreasing", "-dictionary", @@ -4263,7 +4263,7 @@ SortCompare( int order = 0; if (infoPtr->sortMode == SORTMODE_ASCII) { - order = strcmp(elemPtr1->collationKey.strValuePtr, + order = TclUtfCmp(elemPtr1->collationKey.strValuePtr, elemPtr2->collationKey.strValuePtr); } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) { order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr, diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ad1dd5f..a206cc5 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3547,7 +3547,7 @@ TclNRSwitchObjCmd( OPT_LAST }; typedef int (*strCmpFn_t)(const char *, const char *); - strCmpFn_t strCmpFn = strcmp; + strCmpFn_t strCmpFn = TclUtfCmp; mode = OPT_EXACT; foundmode = 0; diff --git a/generic/tclInt.h b/generic/tclInt.h index d77889e..ad1d9c6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3219,6 +3219,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); +MODULE_SCOPE int TclUtfCmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index aed332f..aff10c1 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1108,6 +1108,15 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { return (ch1 - ch2); } @@ -1140,6 +1149,7 @@ Tcl_UtfNcasecmp( unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1148,6 +1158,15 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); @@ -1158,11 +1177,56 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif + if (ch1 != ch2) { + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1182,11 +1246,20 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1 = 0, ch2 = 0; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); -- cgit v0.12 From 04d3db559246ee9e2ac2a5e20e52cf57b7af808b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 10 Jan 2018 09:32:51 +0000 Subject: Don't use TclUtfToUniChar here: it doesn't give any advantage over Tcl_UtfToUniChar --- generic/tclUtil.c | 4 ++-- win/tclWinSerial.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 41795e8..15018de 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2223,7 +2223,7 @@ Tcl_StringCaseMatch( (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); str++; } else { - str += TclUtfToUniChar(str, &ch1); + str += Tcl_UtfToUniChar(str, &ch1); if (nocase) { ch1 = Tcl_UniCharToLower(ch1); } @@ -2252,7 +2252,7 @@ Tcl_StringCaseMatch( ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); pattern++; } else { - pattern += TclUtfToUniChar(pattern, &endChar); + pattern += Tcl_UtfToUniChar(pattern, &endChar); if (nocase) { endChar = Tcl_UniCharToLower(endChar); } diff --git a/win/tclWinSerial.c b/win/tclWinSerial.c index 894f431..acfeecb 100644 --- a/win/tclWinSerial.c +++ b/win/tclWinSerial.c @@ -1741,12 +1741,12 @@ SerialSetOptionProc( Tcl_UniChar character = 0; int charLen; - charLen = TclUtfToUniChar(argv[0], &character); + charLen = Tcl_UtfToUniChar(argv[0], &character); if (argv[0][charLen]) { goto badXchar; } dcb.XonChar = (char) character; - charLen = TclUtfToUniChar(argv[1], &character); + charLen = Tcl_UtfToUniChar(argv[1], &character); if (argv[1][charLen]) { goto badXchar; } -- cgit v0.12 From eb3422673ff62a7427f2e6d166840fce68237d74 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 11 Jan 2018 14:11:38 +0000 Subject: Add test-cases for bug [11ae2be95dac9417], and make a start fixing it. Almost works. --- generic/tclCmdMZ.c | 12 ++++++++++-- generic/tclStringObj.c | 33 +++++++++++++++++++++++++++++++-- tests/string.test | 7 +++++++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index a206cc5..a23f007 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1435,7 +1435,7 @@ StringIndexCmd( } /* - * Get the char length to calulate what 'end' means. + * Get the char length to calculate what 'end' means. */ length = Tcl_GetCharLength(objv[1]); @@ -1444,7 +1444,15 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = Tcl_GetUniChar(objv[1], index); + + if (ch >= 0x10000) { + printf("HI: %x\n", ch); + } + if (ch == -1) { + printf("LO: %x\n", ch); + return TCL_OK; + } /* * If we have a ByteArray object, we're careful to generate a new diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 01f8d80..aa5aba3 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -460,7 +460,8 @@ Tcl_GetCharLength( * Tcl_GetUniChar -- * * Get the index'th Unicode character from the String object. The index - * is assumed to be in the appropriate range. + * is assumed to be in the appropriate range. If index references a lower + * surrogate preceded by a higher surrogate, the result = -1; * * Results: * Returns the index'th Unicode character in the Object. @@ -478,6 +479,7 @@ Tcl_GetUniChar( int index) /* Get the index'th Unicode character. */ { String *stringPtr; + int ch; /* * Optimize the case where we're really dealing with a bytearray object @@ -512,7 +514,23 @@ Tcl_GetUniChar( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return (int) stringPtr->unicode[index]; + + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX == 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch&0xF800) == 0xD800) { + if (ch&0x400) { + if ((index > 0) && ((stringPtr->unicode[index-1]&0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index]&0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; } /* @@ -656,6 +674,17 @@ Tcl_GetRange( stringPtr = GET_STRING(objPtr); } +#if TCL_UTF_MAX == 4 + /* See: bug [11ae2be95dac9417] */ + if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00) + && ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) { + ++first; + } + if ((last+1numChars) && ((stringPtr->unicode[last+1]&0xFC00) == 0xDC00) + && ((stringPtr->unicode[last]&0xFC00) == 0xD800)) { + ++last; + } +#endif return Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1); } diff --git a/tests/string.test b/tests/string.test index cebaf4c..58328bb 100644 --- a/tests/string.test +++ b/tests/string.test @@ -24,6 +24,7 @@ catch [list package require -exact Tcltest [info patchlevel]] testConstraint testobj [expr {[info commands testobj] != {}}] testConstraint testindexobj [expr {[info commands testindexobj] != {}}] +testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -302,6 +303,9 @@ test string-5.19 {string index, bytearray object out of bounds} { test string-5.20 {string index, bytearray object out of bounds} { string index [binary format I* {0x50515253 0x52}] 20 } {} +test string-5.21 {string index, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3] +} [list \U100000 {} b] proc largest_int {} { @@ -1288,6 +1292,9 @@ test string-12.22 {string range, shimmering binary/index} { binary scan $s a* x string range $s $s end } 000000001 +test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3] +} [list \U100000 {} b] test string-13.1 {string repeat} { list [catch {string repeat} msg] $msg -- cgit v0.12 From c8219a9a2995c5658cd709f4bb7b5b933e6575e9 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 12 Jan 2018 10:03:58 +0000 Subject: Fix [11ae2be95d]: tip-389 branch: string range errors with code points greater than U+FFFF --- generic/tclExecute.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclExecute.c b/generic/tclExecute.c index f2cda0c..63281a8 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5445,7 +5445,7 @@ TEBCresume( valuePtr->bytes+index, 1); } else { char buf[TCL_UTF_MAX]; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); + int ch = Tcl_GetUniChar(valuePtr, index); /* * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) @@ -5453,7 +5453,7 @@ TEBCresume( * practical use. */ - length = Tcl_UniCharToUtf(ch, buf); + length = (ch != -1) ? Tcl_UniCharToUtf(ch, buf) : 0; objResultPtr = Tcl_NewStringObj(buf, length); } -- cgit v0.12 From 28ac08663306381af8f310b247bec60e5ed694db Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 17 Apr 2018 21:49:00 +0000 Subject: Slightly better unmatched-surrogates handling. Unmatched High surrogates will still be silently removed, but Unmatched Low surrogates will pass through as-is now. Inspired by Kevin Kenny's remarks. Thanks! --- generic/tclUtf.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 923b1f8..ab4e142 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -148,15 +148,22 @@ Tcl_UniCharToUtf( if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } -- cgit v0.12 From ddb7bbe66c5b805d02fc11f55c53e909e1af45ac Mon Sep 17 00:00:00 2001 From: oehhar Date: Thu, 19 Apr 2018 11:48:28 +0000 Subject: correct msgcat test numbering for section util from 15.x (used twice) to 18.x --- tests/msgcat.test | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/msgcat.test b/tests/msgcat.test index d38152b..12030fb 100644 --- a/tests/msgcat.test +++ b/tests/msgcat.test @@ -1317,33 +1317,33 @@ if {[package vsatisfies [package provide msgcat] 1.7]} { interp bgerror {} $bgerrorsaved - # Tests msgcat-15.*: [mcutil] + # Tests msgcat-18.*: [mcutil] - test msgcat-15.5 {mcutil - no argument} -body { + test msgcat-18.1 {mcutil - no argument} -body { mcutil } -returnCodes 1\ -result {wrong # args: should be "mcutil subcommand ?arg ...?"} - test msgcat-15.6 {mcutil - wrong argument} -body { + test msgcat-18.2 {mcutil - wrong argument} -body { mcutil junk } -returnCodes 1\ -result {unknown subcommand "junk": must be getpreferences, or getsystemlocale} - test msgcat-15.7 {mcutil - partial argument} -body { + test msgcat-18.3 {mcutil - partial argument} -body { mcutil getsystem } -returnCodes 1\ -result {unknown subcommand "getsystem": must be getpreferences, or getsystemlocale} - test msgcat-15.8 {mcutil getpreferences - no argument} -body { + test msgcat-18.4 {mcutil getpreferences - no argument} -body { mcutil getpreferences } -returnCodes 1\ -result {wrong # args: should be "mcutil getpreferences locale"} - test msgcat-15.9 {mcutil getpreferences - DE_de} -body { + test msgcat-18.5 {mcutil getpreferences - DE_de} -body { mcutil getpreferences DE_de } -result {de_de de {}} - test msgcat-15.10 {mcutil getsystemlocale - wrong argument} -body { + test msgcat-18.6 {mcutil getsystemlocale - wrong argument} -body { mcutil getsystemlocale DE_de } -returnCodes 1\ -result {wrong # args: should be "mcutil getsystemlocale"} @@ -1351,7 +1351,7 @@ if {[package vsatisfies [package provide msgcat] 1.7]} { # The result is system dependent # So just test if it runs # The environment variable version was test with test 0.x - test msgcat-15.11 {mcutil getsystemlocale} -body { + test msgcat-18.7 {mcutil getsystemlocale} -body { mcutil getsystemlocale set ok ok } -result {ok} -- cgit v0.12 From 1734eed89f76598661a4ce4c7d5e43ce7fe4368c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 19 Apr 2018 22:29:00 +0000 Subject: Slightly improved (more fail-safe) surrogate handling for TCL_UTF_MAX>3. Backported from latest TIP 389 implementation. (to be used for androwish) --- generic/tclUtf.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6255a4e..0d88d36 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -154,19 +154,26 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX > 3 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } -- cgit v0.12