diff options
-rw-r--r-- | generic/tclEncoding.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 7 | ||||
-rw-r--r-- | generic/tclUtf.c | 58 | ||||
-rw-r--r-- | tests/utf.test | 45 |
4 files changed, 71 insertions, 41 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1584de0..5c7aab8 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2341,7 +2341,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclInt.h b/generic/tclInt.h index 593d878..5c46470 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,8 +3184,13 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +/* + * Bytes F0-F4 are start-bytes for 4-byte sequences. + * Byte 0xED can be the start-byte of an upper surrogate. In that case, + * TclUtfToUCS4() might read the lower surrogate following it too. + */ # define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) + ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length))) MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9ffbfba..9375a01 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,6 +81,28 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1 }; +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#if TCL_UTF_MAX > 4 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#endif + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4, +#else + 1,1,1,1,1, +#endif + 1,1,1,1,1,1,1,1,1,1,1 +}; + /* * Functions used only in this module. */ @@ -359,8 +381,8 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ - register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + const char *src, /* The UTF-8 string. */ + Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { Tcl_UniChar byte; @@ -557,7 +579,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[UCHAR(*src)]; + return length >= complete[UCHAR(*src)]; } /* @@ -580,12 +602,12 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ + const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { Tcl_UniChar ch = 0; - register int i = 0; + int i = 0; /* * The separate implementations are faster. @@ -601,27 +623,29 @@ Tcl_NumUtfChars( } if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { +#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else { - src += TclUtfToUniChar(src, &ch); - } + } else +#endif + src += TclUtfToUniChar(src, &ch); i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { +#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else { - src += TclUtfToUniChar(src, &ch); - } + } else +#endif + src += TclUtfToUniChar(src, &ch); i++; } if (src < endPtr) { @@ -890,8 +914,8 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string to dereference. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; @@ -918,8 +942,8 @@ Tcl_UniCharAtIndex( const char * Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; int len = 0; @@ -1191,7 +1215,7 @@ TclpUtfNcmp2( * fine in the strcmp manner. */ - register int result = 0; + int result = 0; for ( ; numBytes != 0; numBytes--, cs++, ct++) { if (*cs != *ct) { diff --git a/tests/utf.test b/tests/utf.test index 0929801..50351cb 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -29,6 +29,7 @@ testConstraint pre388 [eq \x741 A] testConstraint pairsTo4bytes [expr {[llength [info commands teststringbytes]] && [string length [teststringbytes \uD83D\uDCA9]] == 4}] +testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testfindfirst [llength [info commands testfindfirst]] testConstraint testfindlast [llength [info commands testfindlast]] @@ -501,7 +502,7 @@ test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} { } 1 test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} { testutfnext \x80\x80\x80 -} 1 +} 3 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \xA0\xA0\xA0\xA0 } 1 @@ -601,18 +602,18 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { test utf-6.118 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0G 0 } 0 -test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +test utf-6.119 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0G 1 -} 1 -test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.120 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0 1 -} 1 -test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.121 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0G 2 -} 1 -test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.122 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0 2 -} 1 +} 0 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0G 3 } 1 @@ -990,9 +991,9 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { string index \uD842 0 } "\uD842" -test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 { - string index \uD842 0 -} "\uD842" +test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} { + teststringbytes [string index \uD842 0] +} \xF0 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 } "\uDC42" @@ -1002,18 +1003,18 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 } "\U1F600" -test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 { - string index \uD83D\uDE00G 0 -} "\U1F600" +test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} { + teststringbytes [string index \uD83D\uDE00G 0] +} \xF0 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 1 } "\uDE00" test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G -test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 { - string index \uD83D\uDE00G 1 -} {} +test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} { + teststringbytes [string index \uD83D\uDE00G 1] +} \xED\xB8\x80 test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 2 } G @@ -1029,9 +1030,9 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 } "\U1F600" -test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { - string index \U1F600G 0 -} "\U1F600" +test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} { + teststringbytes [string index \U1F600G 0] +} \xF0 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 1 } G @@ -1040,7 +1041,7 @@ test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { } G test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { string index \U1F600G 1 -} {} +} \uDE00 test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 2 } {} |