From 48b529209c87473364215e8aef740e331f88415a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" <nijtmans@users.sourceforge.net> Date: Tue, 7 Nov 2017 12:15:30 +0000 Subject: Somewhat simplified implementation of TIP #389, in which the "string length" if characters > U+FFFF is considered to be 2, not 1. --- doc/StringObj.3 | 2 +- doc/ToUpper.3 | 2 +- doc/Utf.3 | 2 +- generic/tcl.decls | 10 +++--- generic/tcl.h | 2 +- generic/tclCmdMZ.c | 18 ++++++++-- generic/tclDecls.h | 20 +++++------ generic/tclScan.c | 12 +++++-- generic/tclStringObj.c | 12 +++---- generic/tclUtf.c | 95 +++++++++++++++++++++++++++++++++++--------------- tests/string.test | 26 +++++++------- tests/utf.test | 4 +-- 12 files changed, 132 insertions(+), 73 deletions(-) diff --git a/doc/StringObj.3 b/doc/StringObj.3 index 7042cc8..8d9bb56 100644 --- a/doc/StringObj.3 +++ b/doc/StringObj.3 @@ -37,7 +37,7 @@ Tcl_UniChar * Tcl_UniChar * \fBTcl_GetUnicode\fR(\fIobjPtr\fR) .sp -Tcl_UniChar +int \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR) .sp int diff --git a/doc/ToUpper.3 b/doc/ToUpper.3 index b933e9c..14766da 100644 --- a/doc/ToUpper.3 +++ b/doc/ToUpper.3 @@ -13,7 +13,7 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_ .nf \fB#include <tcl.h>\fR .sp -Tcl_UniChar +int \fBTcl_UniCharToUpper\fR(\fIch\fR) .sp Tcl_UniChar diff --git a/doc/Utf.3 b/doc/Utf.3 index 378c806..5cd6b7df 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -63,7 +63,7 @@ const char * const char * \fBTcl_UtfPrev\fR(\fIsrc, start\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) .sp const char * diff --git a/generic/tcl.decls b/generic/tcl.decls index b2b91a9..e3ea9bc 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1148,16 +1148,16 @@ declare 319 { Tcl_QueuePosition position) } declare 320 { - Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index) + int Tcl_UniCharAtIndex(const char *src, int index) } declare 321 { - Tcl_UniChar Tcl_UniCharToLower(int ch) + int Tcl_UniCharToLower(int ch) } declare 322 { - Tcl_UniChar Tcl_UniCharToTitle(int ch) + int Tcl_UniCharToTitle(int ch) } declare 323 { - Tcl_UniChar Tcl_UniCharToUpper(int ch) + int Tcl_UniCharToUpper(int ch) } declare 324 { int Tcl_UniCharToUtf(int ch, char *buf) @@ -1351,7 +1351,7 @@ declare 380 { int Tcl_GetCharLength(Tcl_Obj *objPtr) } declare 381 { - Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index) + int Tcl_GetUniChar(Tcl_Obj *objPtr, int index) } declare 382 { Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr) diff --git a/generic/tcl.h b/generic/tcl.h index 07d841d..f874997 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2201,7 +2201,7 @@ typedef struct Tcl_EncodingType { */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +#define TCL_UTF_MAX 4 #endif /* diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 2195aa1..b6a8fe9 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -309,7 +309,7 @@ Tcl_RegexpObjCmd( eflags = 0; } else if (offset > stringLength) { eflags = TCL_REG_NOTBOL; - } else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') { + } else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') { eflags = 0; } else { eflags = TCL_REG_NOTBOL; @@ -1218,6 +1218,12 @@ Tcl_SplitObjCmd( for ( ; stringPtr < end; stringPtr += len) { len = TclUtfToUniChar(stringPtr, &ch); +#if TCL_UTF_MAX == 4 + if (!len) { + continue; + } +#endif + /* * Assume Tcl_UniChar is an integral type... */ @@ -1814,8 +1820,16 @@ StringIsCmd( } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { + int fullchar; length2 = TclUtfToUniChar(string1, &ch); - if (!chcomp(ch)) { + fullchar = ch; +#if TCL_UTF_MAX == 4 + if (!length2) { + length2 = TclUtfToUniChar(string1, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (!chcomp(fullchar)) { result = 0; break; } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 464fc0f..5f83636 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -959,13 +959,13 @@ EXTERN void Tcl_ThreadAlert(Tcl_ThreadId threadId); EXTERN void Tcl_ThreadQueueEvent(Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 320 */ -EXTERN Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index); +EXTERN int Tcl_UniCharAtIndex(const char *src, int index); /* 321 */ -EXTERN Tcl_UniChar Tcl_UniCharToLower(int ch); +EXTERN int Tcl_UniCharToLower(int ch); /* 322 */ -EXTERN Tcl_UniChar Tcl_UniCharToTitle(int ch); +EXTERN int Tcl_UniCharToTitle(int ch); /* 323 */ -EXTERN Tcl_UniChar Tcl_UniCharToUpper(int ch); +EXTERN int Tcl_UniCharToUpper(int ch); /* 324 */ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ @@ -1117,7 +1117,7 @@ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, /* 380 */ EXTERN int Tcl_GetCharLength(Tcl_Obj *objPtr); /* 381 */ -EXTERN Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index); +EXTERN int Tcl_GetUniChar(Tcl_Obj *objPtr, int index); /* 382 */ EXTERN Tcl_UniChar * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ @@ -2186,10 +2186,10 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */ void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */ void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */ - Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ - Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */ - Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */ - Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */ + int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ + int (*tcl_UniCharToLower) (int ch); /* 321 */ + int (*tcl_UniCharToTitle) (int ch); /* 322 */ + int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ CONST84_RETURN char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ @@ -2247,7 +2247,7 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */ void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ - Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ + int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ diff --git a/generic/tclScan.c b/generic/tclScan.c index e1fcad4..7f71262 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -885,9 +885,17 @@ Tcl_ScanObjCmd( * Scan a single Unicode character. */ - string += TclUtfToUniChar(string, &sch); + offset = TclUtfToUniChar(string, &sch); + i = (int)sch; +#if TCL_UTF_MAX == 4 + if (!offset) { + offset = Tcl_UtfToUniChar(string, &sch); + i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); + } +#endif + string += offset; if (!(flags & SCAN_SUPPRESS)) { - objPtr = Tcl_NewIntObj((int)sch); + objPtr = Tcl_NewIntObj(i); Tcl_IncrRefCount(objPtr); CLANG_ASSERT(objs); objs[objIndex++] = objPtr; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 3a35bcf..1ccd778 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -466,7 +466,7 @@ Tcl_GetCharLength( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_GetUniChar( Tcl_Obj *objPtr, /* The object to get the Unicode charater * from. */ @@ -483,7 +483,7 @@ Tcl_GetUniChar( if (TclIsPureByteArray(objPtr)) { unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); - return (Tcl_UniChar) bytes[index]; + return (int) bytes[index]; } /* @@ -507,7 +507,7 @@ Tcl_GetUniChar( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return stringPtr->unicode[index]; + return (int) stringPtr->unicode[index]; } /* @@ -3462,7 +3462,6 @@ TclStringObjReverse( * Tcl_SetObjLength into growing the unicode rep buffer. */ - ch = 0; objPtr = Tcl_NewUnicodeObj(&ch, 1); Tcl_SetObjLength(objPtr, stringPtr->numChars); to = Tcl_GetUnicode(objPtr); @@ -3565,7 +3564,7 @@ ExtendUnicodeRepWithString( { String *stringPtr = GET_STRING(objPtr); int needed, numOrigChars = 0; - Tcl_UniChar *dst; + Tcl_UniChar *dst, unichar = 0; if (stringPtr->hasUnicode) { numOrigChars = stringPtr->numChars; @@ -3588,7 +3587,8 @@ ExtendUnicodeRepWithString( numAppendChars = 0; } for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { - bytes += TclUtfToUniChar(bytes, dst); + bytes += TclUtfToUniChar(bytes, &unichar); + *dst = unichar; } *dst = 0; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 25cc2d1..859fe78 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -699,7 +699,7 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ @@ -819,7 +819,8 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int bytes; @@ -830,7 +831,14 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -872,7 +880,8 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int bytes; @@ -883,7 +892,14 @@ Tcl_UtfToLower( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -926,7 +942,8 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int bytes; @@ -939,7 +956,14 @@ Tcl_UtfToTitle( if (*src) { bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + titleChar = Tcl_UniCharToTitle(titleChar); if (bytes < TclUtfCount(titleChar)) { memcpy(dst, src, (size_t) bytes); @@ -951,7 +975,14 @@ Tcl_UtfToTitle( } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); if (bytes < TclUtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); @@ -1159,16 +1190,18 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1187,16 +1220,18 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + if (GetCaseType(info) & 0x02) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1215,23 +1250,25 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); + ch += ((mode & 0x4) ? -1 : 1); + } else if (mode == 0x4) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* diff --git a/tests/string.test b/tests/string.test index cb901b9..cebaf4c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1697,40 +1697,40 @@ test string-24.4 {string reverse command - unshared string} { string reverse $x$y } edcba test string-24.5 {string reverse command - shared unicode string} { - set x abcde\udead + set x abcde\ud0ad string reverse $x -} \udeadedcba +} \ud0adedcba test string-24.6 {string reverse command - unshared string} { set x abc - set y de\udead + set y de\ud0ad string reverse $x$y -} \udeadedcba +} \ud0adedcba test string-24.7 {string reverse command - simple case} { string reverse a } a test string-24.8 {string reverse command - simple case} { - string reverse \udead -} \udead + string reverse \ud0ad +} \ud0ad test string-24.9 {string reverse command - simple case} { string reverse {} } {} test string-24.10 {string reverse command - corner case} { - set x \ubeef\udead + set x \ubeef\ud0ad string reverse $x -} \udead\ubeef +} \ud0ad\ubeef test string-24.11 {string reverse command - corner case} { set x \ubeef - set y \udead + set y \ud0ad string reverse $x$y -} \udead\ubeef +} \ud0ad\ubeef test string-24.12 {string reverse command - corner case} { set x \ubeef - set y \udead + set y \ud0ad string is ascii [string reverse $x$y] } 0 test string-24.13 {string reverse command - pure Unicode string} { - string reverse [string range \ubeef\udead\ubeef\udead\ubeef\udead 1 5] -} \udead\ubeef\udead\ubeef\udead + string reverse [string range \ubeef\ud0ad\ubeef\ud0ad\ubeef\ud0ad 1 5] +} \ud0ad\ubeef\ud0ad\ubeef\ud0ad test string-24.14 {string reverse command - pure bytearray} { binary scan [string reverse [binary format H* 010203]] H* x set x diff --git a/tests/utf.test b/tests/utf.test index 422ab08..45f9c0c 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -68,10 +68,10 @@ test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestrin } {1} test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF0\x90\x80\x80"] -} -result {1} +} -result {2} test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] -} -result {1} +} -result {2} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} -- cgit v0.12