diff options
| -rw-r--r-- | doc/StringObj.3 | 5 | ||||
| -rw-r--r-- | doc/ToUpper.3 | 6 | ||||
| -rw-r--r-- | doc/Utf.3 | 5 | ||||
| -rw-r--r-- | generic/tcl.decls | 10 | ||||
| -rw-r--r-- | generic/tcl.h | 6 | ||||
| -rw-r--r-- | generic/tclCmdMZ.c | 16 | ||||
| -rw-r--r-- | generic/tclDecls.h | 20 | ||||
| -rw-r--r-- | generic/tclExecute.c | 6 | ||||
| -rw-r--r-- | generic/tclStringObj.c | 70 | ||||
| -rw-r--r-- | generic/tclUtf.c | 211 | ||||
| -rw-r--r-- | tests/string.test | 3 |
11 files changed, 223 insertions, 135 deletions
diff --git a/doc/StringObj.3 b/doc/StringObj.3 index 7042cc8..e011c27 100644 --- a/doc/StringObj.3 +++ b/doc/StringObj.3 @@ -37,7 +37,7 @@ Tcl_UniChar * Tcl_UniChar * \fBTcl_GetUnicode\fR(\fIobjPtr\fR) .sp -Tcl_UniChar +int \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR) .sp int @@ -204,7 +204,8 @@ where the caller does not need the length of the unicode string representation. .PP \fBTcl_GetUniChar\fR returns the \fIindex\fR'th character in the -value's Unicode representation. +value's Unicode representation. If the index is out of range or +it references a low surrogate preceded by a high surrogate, it returns -1; .PP \fBTcl_GetRange\fR returns a newly created value comprised of the characters between \fIfirst\fR and \fIlast\fR (inclusive) in the diff --git a/doc/ToUpper.3 b/doc/ToUpper.3 index be614e7..1c7a0c2 100644 --- a/doc/ToUpper.3 +++ b/doc/ToUpper.3 @@ -13,13 +13,13 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_ .nf \fB#include <tcl.h>\fR .sp -Tcl_UniChar +int \fBTcl_UniCharToUpper\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToLower\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToTitle\fR(\fIch\fR) .sp int @@ -63,7 +63,7 @@ const char * const char * \fBTcl_UtfPrev\fR(\fIsrc, start\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) .sp const char * @@ -140,6 +140,9 @@ number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is +a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the +cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR +and returns 1. If the input is otherwise not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and 0x00ff and return 1. diff --git a/generic/tcl.decls b/generic/tcl.decls index 60aebfd..da551bb 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1145,16 +1145,16 @@ declare 319 { Tcl_QueuePosition position) } declare 320 { - Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index) + int Tcl_UniCharAtIndex(const char *src, int index) } declare 321 { - Tcl_UniChar Tcl_UniCharToLower(int ch) + int Tcl_UniCharToLower(int ch) } declare 322 { - Tcl_UniChar Tcl_UniCharToTitle(int ch) + int Tcl_UniCharToTitle(int ch) } declare 323 { - Tcl_UniChar Tcl_UniCharToUpper(int ch) + int Tcl_UniCharToUpper(int ch) } declare 324 { int Tcl_UniCharToUtf(int ch, char *buf) @@ -1348,7 +1348,7 @@ declare 380 { int Tcl_GetCharLength(Tcl_Obj *objPtr) } declare 381 { - Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index) + int Tcl_GetUniChar(Tcl_Obj *objPtr, int index) } declare 382 { Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr) diff --git a/generic/tcl.h b/generic/tcl.h index 45fa0ac..c3e0f9d 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2155,7 +2155,7 @@ typedef struct Tcl_EncodingType { */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +#define TCL_UTF_MAX 4 #endif /* @@ -2345,10 +2345,10 @@ typedef int (Tcl_NRPostProc) (ClientData data[], Tcl_Interp *interp, /* *---------------------------------------------------------------------------- * The following constant is used to test for older versions of Tcl in the - * stubs tables. + * stubs tables. If TCL_UTF_MAX>4 use a different value. */ -#define TCL_STUB_MAGIC ((int) 0xFCA3BACF) +#define TCL_STUB_MAGIC ((int) 0xFCA3BACF + (TCL_UTF_MAX>4)) /* * The following function is required to be defined in all stubs aware diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 43d6206..6aaf1de 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -491,7 +491,7 @@ Tcl_RegsubObjCmd( Tcl_RegExp regExpr; Tcl_RegExpInfo info; Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL; - Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend; + Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec = 0, *wend; static const char *const options[] = { "-all", "-command", "-expanded", "-line", @@ -1220,7 +1220,7 @@ Tcl_SplitObjCmd( len = TclUtfToUniChar(stringPtr, &ch); fullchar = ch; -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if (!len) { len += TclUtfToUniChar(stringPtr, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; @@ -1419,7 +1419,7 @@ StringIndexCmd( } /* - * Get the char length to calulate what 'end' means. + * Get the char length to calculate what 'end' means. */ length = Tcl_GetCharLength(objv[1]); @@ -1428,7 +1428,11 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = Tcl_GetUniChar(objv[1], index); + + if (ch == -1) { + return TCL_OK; + } /* * If we have a ByteArray object, we're careful to generate a new @@ -1440,7 +1444,7 @@ StringIndexCmd( Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { - char buf[TCL_UTF_MAX]; + char buf[4]; length = Tcl_UniCharToUtf(ch, buf); Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); @@ -1803,7 +1807,7 @@ StringIsCmd( int fullchar; length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if (!length2) { length2 = TclUtfToUniChar(string1, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 49f34e8..592d945 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -962,13 +962,13 @@ EXTERN void Tcl_ThreadAlert(Tcl_ThreadId threadId); EXTERN void Tcl_ThreadQueueEvent(Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 320 */ -EXTERN Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index); +EXTERN int Tcl_UniCharAtIndex(const char *src, int index); /* 321 */ -EXTERN Tcl_UniChar Tcl_UniCharToLower(int ch); +EXTERN int Tcl_UniCharToLower(int ch); /* 322 */ -EXTERN Tcl_UniChar Tcl_UniCharToTitle(int ch); +EXTERN int Tcl_UniCharToTitle(int ch); /* 323 */ -EXTERN Tcl_UniChar Tcl_UniCharToUpper(int ch); +EXTERN int Tcl_UniCharToUpper(int ch); /* 324 */ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ @@ -1123,7 +1123,7 @@ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, /* 380 */ EXTERN int Tcl_GetCharLength(Tcl_Obj *objPtr); /* 381 */ -EXTERN Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index); +EXTERN int Tcl_GetUniChar(Tcl_Obj *objPtr, int index); /* 382 */ EXTERN Tcl_UniChar * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ @@ -2193,10 +2193,10 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */ void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */ void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */ - Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ - Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */ - Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */ - Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */ + int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ + int (*tcl_UniCharToLower) (int ch); /* 321 */ + int (*tcl_UniCharToTitle) (int ch); /* 322 */ + int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ @@ -2254,7 +2254,7 @@ typedef struct TclStubs { Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */ void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ - Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ + int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 49c74da..35743a1 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5292,8 +5292,8 @@ TEBCresume( objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { - char buf[TCL_UTF_MAX]; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); + char buf[4]; + int ch = Tcl_GetUniChar(valuePtr, index); /* * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) @@ -5301,7 +5301,7 @@ TEBCresume( * practical use. */ - length = Tcl_UniCharToUtf(ch, buf); + length = (ch != -1) ? Tcl_UniCharToUtf(ch, buf) : 0; objResultPtr = Tcl_NewStringObj(buf, length); } diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index fa50d6d..5915ce0 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -460,8 +460,9 @@ Tcl_GetCharLength( * * Tcl_GetUniChar -- * - * Get the index'th Unicode character from the String object. The index - * is assumed to be in the appropriate range. + * Get the index'th Unicode character from the String object. If index + * is out of range or it references a low surrogate preceded by a high + * surrogate, the result = -1; * * Results: * Returns the index'th Unicode character in the Object. @@ -472,13 +473,18 @@ Tcl_GetCharLength( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_GetUniChar( Tcl_Obj *objPtr, /* The object to get the Unicode charater * from. */ int index) /* Get the index'th Unicode character. */ { String *stringPtr; + int ch, length; + + if (index < 0) { + return -1; + } /* * Optimize the case where we're really dealing with a bytearray object @@ -486,9 +492,12 @@ Tcl_GetUniChar( */ if (TclIsPureByteArray(objPtr)) { - unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (index >= length) { + return -1; + } - return (Tcl_UniChar) bytes[index]; + return (int) bytes[index]; } /* @@ -512,7 +521,26 @@ Tcl_GetUniChar( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return stringPtr->unicode[index]; + + if (index >= stringPtr->numChars) { + return -1; + } + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX <= 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch&0xF800) == 0xD800) { + if (ch&0x400) { + if ((index > 0) && ((stringPtr->unicode[index-1]&0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index]&0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; } /* @@ -612,15 +640,24 @@ Tcl_GetRange( { Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ String *stringPtr; + int length; + if (first < 0) { + first = 0; + } /* * Optimize the case where we're really dealing with a bytearray object * we don't need to convert to a string to perform the substring operation. */ if (TclIsPureByteArray(objPtr)) { - unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); - + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (last >= length) { + last = length - 1; + } + if (last < first) { + return Tcl_NewObj(); + } return Tcl_NewByteArrayObj(bytes+first, last-first+1); } @@ -640,6 +677,12 @@ Tcl_GetRange( TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { + if (last >= stringPtr->numChars) { + last = stringPtr->numChars - 1; + } + if (last < first) { + return Tcl_NewObj(); + } newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1); /* @@ -654,8 +697,13 @@ Tcl_GetRange( FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - -#if TCL_UTF_MAX == 4 + if (last > stringPtr->numChars) { + last = stringPtr->numChars; + } + if (last < first) { + return Tcl_NewObj(); + } +#if TCL_UTF_MAX <= 4 /* See: bug [11ae2be95dac9417] */ if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00) && ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) { @@ -1930,7 +1978,7 @@ Tcl_AppendFormatToObj( } break; case 'c': { - char buf[TCL_UTF_MAX]; + char buf[4]; int code, length; if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 2d8750d..33c22e0 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -98,11 +98,9 @@ TclUtfCount( if (ch <= 0x7FF) { return 2; } -#if TCL_UTF_MAX > 3 if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { return 4; } -#endif return 3; } @@ -131,7 +129,7 @@ Tcl_UniCharToUtf( char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character - * (at most TCL_UTF_MAX bytes). */ + * (at most 4 bytes). */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; @@ -144,27 +142,30 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } -#endif goto three; } - -#if TCL_UTF_MAX > 3 if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); @@ -172,7 +173,6 @@ Tcl_UniCharToUtf( buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } -#endif } ch = 0xFFFD; @@ -268,6 +268,13 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ @@ -284,11 +291,17 @@ Tcl_UtfToUniChar( if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; + if ((unsigned)(byte-0x80) < (unsigned) 0x20) { + *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + } else { + *chPtr = (Tcl_UniChar) byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { @@ -329,17 +342,7 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. */ -#if TCL_UTF_MAX == 3 - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - if (byte & 0x100000) { - /* out of range, < 0x10000 or > 0x10ffff */ - } else { - /* produce replacement character, and advance source pointer */ - *chPtr = (Tcl_UniChar) 0xFFFD; - return 4; - } -#elif TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 Tcl_UniChar surrogate; byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) @@ -551,7 +554,7 @@ Tcl_UtfFindFirst( while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if (!len) { len += TclUtfToUniChar(src, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; @@ -599,7 +602,7 @@ Tcl_UtfFindLast( while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if (!len) { len += TclUtfToUniChar(src, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; @@ -642,7 +645,7 @@ Tcl_UtfNext( Tcl_UniChar ch = 0; int len = TclUtfToUniChar(src, &ch); -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 if (len == 0) { len = TclUtfToUniChar(src, &ch); } @@ -717,17 +720,33 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; + int fullchar = 0; +#if TCL_UTF_MAX <= 4 + int len = 1; +#endif while (index-- >= 0) { +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); +#else src += TclUtfToUniChar(src, &ch); +#endif } - return ch; + fullchar = ch; +#if TCL_UTF_MAX <= 4 + if (!len) { + /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ + (void)TclUtfToUniChar(src, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + return fullchar; } /* @@ -835,7 +854,8 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int bytes; @@ -846,7 +866,14 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -888,7 +915,8 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int bytes; @@ -899,7 +927,14 @@ Tcl_UtfToLower( src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the @@ -942,7 +977,8 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int bytes; @@ -955,7 +991,14 @@ Tcl_UtfToTitle( if (*src) { bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + titleChar = Tcl_UniCharToTitle(titleChar); if (bytes < TclUtfCount(titleChar)) { memcpy(dst, src, (size_t) bytes); @@ -967,7 +1010,14 @@ Tcl_UtfToTitle( } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); if (bytes < TclUtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); @@ -1069,7 +1119,7 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1120,7 +1170,7 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1169,7 +1219,7 @@ TclUtfCmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1215,7 +1265,7 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX <= 4 /* Surrogates always report higher than non-surrogates */ if (((ch1 & 0xFC00) == 0xD800)) { if ((ch2 & 0xFC00) != 0xD800) { @@ -1252,16 +1302,18 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1280,16 +1332,18 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + if (GetCaseType(info) & 0x02) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1308,23 +1362,25 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); + ch += ((mode & 0x4) ? -1 : 1); + } else if (mode == 0x4) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* @@ -1458,11 +1514,9 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1486,11 +1540,9 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1514,7 +1566,6 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) { @@ -1525,7 +1576,6 @@ Tcl_UniCharIsControl( } return 0; } -#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1549,11 +1599,9 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1577,12 +1625,10 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1606,11 +1652,9 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1634,12 +1678,10 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { ch &= 0x1FFFFF; return (ch >= 0xE0100) && (ch <= 0xE01EF); } -#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1663,11 +1705,9 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1691,13 +1731,8 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 /* Ignore upper 11 bits. */ ch &= 0x1FFFFF; -#else - /* Ignore upper 16 bits. */ - ch &= 0xFFFF; -#endif /* * If the character is within the first 127 characters, just use the @@ -1706,10 +1741,8 @@ Tcl_UniCharIsSpace( if (ch < 0x80) { return TclIsSpaceProc((char) ch); -#if TCL_UTF_MAX > 3 } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; -#endif } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; @@ -1738,11 +1771,9 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1766,11 +1797,9 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } -#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } diff --git a/tests/string.test b/tests/string.test index da302eb..2300f08 100644 --- a/tests/string.test +++ b/tests/string.test @@ -487,6 +487,9 @@ test string-5.19.$noComp {string index, bytearray object out of bounds} { test string-5.20.$noComp {string index, bytearray object out of bounds} { run {string index [binary format I* {0x50515253 0x52}] 20} } {} +test string-5.21 {string index, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3] +} [list \U100000 {} b] proc largest_int {} { |
