From cc90266615cb98853ebc61301119d1f7a3718d7a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 2 May 2020 10:15:29 +0000 Subject: More fixes for [ed29806baf]. Not working yet. WIP --- generic/tclEncoding.c | 2 +- generic/tclInt.h | 7 ++++++- generic/tclUtf.c | 58 ++++++++++++++++++++++++++++++++++++--------------- tests/utf.test | 45 ++++++++++++++++++++------------------- 4 files changed, 71 insertions(+), 41 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1584de0..5c7aab8 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2341,7 +2341,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclInt.h b/generic/tclInt.h index 593d878..5c46470 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,8 +3184,13 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +/* + * Bytes F0-F4 are start-bytes for 4-byte sequences. + * Byte 0xED can be the start-byte of an upper surrogate. In that case, + * TclUtfToUCS4() might read the lower surrogate following it too. + */ # define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) + ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length))) MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9ffbfba..9375a01 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,6 +81,28 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1 }; +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#if TCL_UTF_MAX > 4 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#endif + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4, +#else + 1,1,1,1,1, +#endif + 1,1,1,1,1,1,1,1,1,1,1 +}; + /* * Functions used only in this module. */ @@ -359,8 +381,8 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ - register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + const char *src, /* The UTF-8 string. */ + Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { Tcl_UniChar byte; @@ -557,7 +579,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[UCHAR(*src)]; + return length >= complete[UCHAR(*src)]; } /* @@ -580,12 +602,12 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ + const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { Tcl_UniChar ch = 0; - register int i = 0; + int i = 0; /* * The separate implementations are faster. @@ -601,27 +623,29 @@ Tcl_NumUtfChars( } if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { +#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else { - src += TclUtfToUniChar(src, &ch); - } + } else +#endif + src += TclUtfToUniChar(src, &ch); i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { +#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else { - src += TclUtfToUniChar(src, &ch); - } + } else +#endif + src += TclUtfToUniChar(src, &ch); i++; } if (src < endPtr) { @@ -890,8 +914,8 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string to dereference. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; @@ -918,8 +942,8 @@ Tcl_UniCharAtIndex( const char * Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. */ - register int index) /* The position of the desired character. */ + const char *src, /* The UTF-8 string. */ + int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; int len = 0; @@ -1191,7 +1215,7 @@ TclpUtfNcmp2( * fine in the strcmp manner. */ - register int result = 0; + int result = 0; for ( ; numBytes != 0; numBytes--, cs++, ct++) { if (*cs != *ct) { diff --git a/tests/utf.test b/tests/utf.test index 0929801..50351cb 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -29,6 +29,7 @@ testConstraint pre388 [eq \x741 A] testConstraint pairsTo4bytes [expr {[llength [info commands teststringbytes]] && [string length [teststringbytes \uD83D\uDCA9]] == 4}] +testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testfindfirst [llength [info commands testfindfirst]] testConstraint testfindlast [llength [info commands testfindlast]] @@ -501,7 +502,7 @@ test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} { } 1 test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} { testutfnext \x80\x80\x80 -} 1 +} 3 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \xA0\xA0\xA0\xA0 } 1 @@ -601,18 +602,18 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { test utf-6.118 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0G 0 } 0 -test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +test utf-6.119 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0G 1 -} 1 -test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.120 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0 1 -} 1 -test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.121 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0G 2 -} 1 -test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 0 +test utf-6.122 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0 2 -} 1 +} 0 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0G 3 } 1 @@ -990,9 +991,9 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { string index \uD842 0 } "\uD842" -test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 { - string index \uD842 0 -} "\uD842" +test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} { + teststringbytes [string index \uD842 0] +} \xF0 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 } "\uDC42" @@ -1002,18 +1003,18 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 } "\U1F600" -test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 { - string index \uD83D\uDE00G 0 -} "\U1F600" +test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} { + teststringbytes [string index \uD83D\uDE00G 0] +} \xF0 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 1 } "\uDE00" test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G -test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 { - string index \uD83D\uDE00G 1 -} {} +test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} { + teststringbytes [string index \uD83D\uDE00G 1] +} \xED\xB8\x80 test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 2 } G @@ -1029,9 +1030,9 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 } "\U1F600" -test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { - string index \U1F600G 0 -} "\U1F600" +test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} { + teststringbytes [string index \U1F600G 0] +} \xF0 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 1 } G @@ -1040,7 +1041,7 @@ test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { } G test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { string index \U1F600G 1 -} {} +} \uDE00 test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 2 } {} -- cgit v0.12 From 797bc84ed2bc1f24a8adc1e42a91ef90d2c0c91f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 2 May 2020 21:54:14 +0000 Subject: Seems almost correct. Still problem with "string index" for TCL_UTF_MAX>3 --- generic/tclUtf.c | 25 +++++------- tests/utf.test | 115 +++++++++++++++++++++++++++---------------------------- 2 files changed, 66 insertions(+), 74 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9375a01..03a7ca9 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,13 +64,10 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -#if TCL_UTF_MAX != 4 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ +/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#endif +/* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 3 @@ -80,7 +77,7 @@ static const unsigned char totalBytes[256] = { #endif 1,1,1,1,1,1,1,1,1,1,1 }; - + static const unsigned char complete[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -95,7 +92,7 @@ static const unsigned char complete[256] = { #endif 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 4,4,4,4,4, #else 1,1,1,1,1, @@ -626,26 +623,24 @@ Tcl_NumUtfChars( const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { -#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else -#endif - src += TclUtfToUniChar(src, &ch); + } else { + src += TclUtfToUniChar(src, &ch); + } i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { -#if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; - } else -#endif - src += TclUtfToUniChar(src, &ch); + } else { + src += TclUtfToUniChar(src, &ch); + } i++; } if (src < endPtr) { diff --git a/tests/utf.test b/tests/utf.test index 50351cb..71b4978 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -398,7 +398,7 @@ test utf-6.68 {Tcl_UtfNext} testutfnext { test utf-6.69.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0 } 1 -test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.69.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0 } 4 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -416,37 +416,37 @@ test utf-6.73 {Tcl_UtfNext} testutfnext { test utf-6.74.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.74.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G } 4 test utf-6.75.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.75.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 4 test utf-6.76.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.76.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 4 test utf-6.77.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.77.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 4 test utf-6.78.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.78.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 4 test utf-6.79.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 -test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { +test utf-6.79.1 {Tcl_UtfNext} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -473,7 +473,7 @@ test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext ucs2} { testutfnext \xF0\x90\x80\x80 } 1 -test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext ucs4} { testutfnext \xF0\x90\x80\x80 } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} testutfnext { @@ -485,7 +485,7 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} tes test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { testutfnext \xF4\x8F\xBF\xBF } 1 -test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs4} { testutfnext \xF4\x8F\xBF\xBF } 4 test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { @@ -497,12 +497,9 @@ test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 } 1 -test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80 } 1 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf} { - testutfnext \x80\x80\x80 -} 3 test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \xA0\xA0\xA0\xA0 } 1 @@ -554,64 +551,64 @@ test utf-6.109 {Tcl_UtfNext, read limits} testutfnext { test utf-6.110.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G 1 } 1 -test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G 1 } 0 test utf-6.111.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G 2 } 1 -test utf-6.111.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.111.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G 2 } 0 test utf-6.112.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G 3 } 1 -test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.112.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G 3 } 0 test utf-6.113.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0G 4 } 1 -test utf-6.113.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.113.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0G 4 } 4 test utf-6.114.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xA0 1 } 1 -test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xA0 1 } 0 test utf-6.115.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xA0 2 } 1 -test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xA0 2 } 0 test utf-6.116.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xA0 3 } 1 -test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.116.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xA0 3 } 0 test utf-6.117.0 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xF2\xA0\xA0\xA0\xA0 4 } 1 -test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext fullutf} { +test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext ucs4} { testutfnext \xF2\xA0\xA0\xA0\xA0 4 } 4 test utf-6.118 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0G 0 } 0 -test utf-6.119 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0G 1 } 0 -test utf-6.120 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0 1 } 0 -test utf-6.121 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0G 2 } 0 -test utf-6.122 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0 2 } 0 test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { @@ -693,19 +690,19 @@ test utf-7.9.2 {Tcl_UtfPrev} testutfprev { test utf-7.10.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0 } 1 test utf-7.10.2 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.3 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.10.3 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0\xA0\xA0 3 } 1 test utf-7.10.4 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 -test utf-7.10.5 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.10.5 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0\xF8\xA0 3 } 1 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -750,19 +747,19 @@ test utf-7.14.2 {Tcl_UtfPrev} testutfprev { test utf-7.15.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0\xA0 } 1 test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0\xA0\xA0 4 } 1 test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 -test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { +test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev ucs4} { testutfprev A\xF2\xA0\xA0\xF8 4 } 1 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -888,19 +885,19 @@ test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 } 2 -test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs4} { testutfprev A\xF0\x90\x80\x80 } 1 test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { +test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs4} { testutfprev A\xF0\x90\x80\x80 4 } 1 test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 3 } 2 -test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { +test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs4} { testutfprev A\xF0\x90\x80\x80 3 } 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -933,19 +930,19 @@ test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} t test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF } 2 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} { testutfprev A\xF4\x8F\xBF\xBF } 1 test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { +test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} { testutfprev A\xF4\x8F\xBF\xBF 4 } 1 test utf-7.48.4 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 -test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { +test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs4} { testutfprev A\xF4\x8F\xBF\xBF 3 } 1 test utf-7.48.6 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { @@ -978,37 +975,37 @@ test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { } a test utf-8.2 {Tcl_UniCharAtIndex: index = 0} { string index \u4E4E\u25A 0 -} "\u4E4E" +} \u4E4E test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { string index abcd 2 } c test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 -} "\uFF" +} \uFF test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { string index \uD842 0 -} "\uD842" -test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { - string index \uD842 0 -} "\uD842" +} \uD842 +test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes ucs4} { + teststringbytes [string index \uD842 0] +} \xF0 test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {teststringbytes tip389} { teststringbytes [string index \uD842 0] } \xF0 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 -} "\uDC42" +} \uDC42 test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 0 -} "\uD83D" +} \uD83D test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 -} "\U1F600" +} \U1F600 test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {teststringbytes tip389} { teststringbytes [string index \uD83D\uDE00G 0] } \xF0 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 1 -} "\uDE00" +} \uDE00 test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G @@ -1026,10 +1023,10 @@ test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} tip389 { } G test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { string index \U1F600G 0 -} "\uFFFD" +} \uFFFD test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 -} "\U1F600" +} \U1F600 test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc teststringbytes tip389} { teststringbytes [string index \U1F600G 0] } \xF0 @@ -1057,22 +1054,22 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { } abc test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 -} "\u25A\xFF\u543kl" +} \u25A\xFF\u543kl test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 { string range \uD83D\uDE00G 0 0 -} "\uD83D" +} \uD83D test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 { string range \uD83D\uDE00G 0 0 -} "\U1F600" +} \U1F600 test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} tip389 { string range \uD83D\uDE00G 0 0 -} "\U1F600" +} \U1F600 test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range \uD83D\uDE00G 1 1 -} "\uDE00" +} \uDE00 test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { string range \uD83D\uDE00G 1 1 -} "G" +} G test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { string range \uD83D\uDE00G 1 1 } {} @@ -1087,19 +1084,19 @@ test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { } G test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} { string range \U1f600G 0 0 -} "\uFFFD" +} \uFFFD test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs4} { string range \U1f600G 0 0 -} "\U1F600" +} \U1F600 test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc tip389} { string range \U1f600G 0 0 -} "\U1F600" +} \U1F600 test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { string range \U1f600G 1 1 } G test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} { string range \U1f600G 1 1 -} "G" +} G test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} { string range \U1f600G 1 1 } {} -- cgit v0.12 From 413ea81a284c691dc5ed4ad48217370ce83f65f7 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 4 May 2020 14:17:23 +0000 Subject: More progress/simplification --- generic/tclUtf.c | 23 ++--------------------- tests/utf.test | 33 ++++++++++++--------------------- 2 files changed, 14 insertions(+), 42 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c4b5305..b964b7e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -70,25 +70,6 @@ static const unsigned char totalBytes[256] = { /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 -}; - -static const unsigned char complete[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ - 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 4 4,4,4,4,4, #else @@ -183,7 +164,7 @@ Invalid( unsigned char byte = *src; int index; - if (byte % 0x04) { + if ((byte & 0xC3) != 0xC0) { /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } @@ -573,7 +554,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= complete[UCHAR(*src)]; + return length >= totalBytes[UCHAR(*src)]; } /* diff --git a/tests/utf.test b/tests/utf.test index c455078..3f74f6f 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -493,25 +493,16 @@ test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { testutfnext \xF4\x90\x80\x80 } 1 -test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext ucs2} { +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 -} 1 -test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext fullutf knownBug} { - testutfnext \xA0\xA0\xA0 -} 3 -test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { - testutfnext \x80\x80\x80 } 3 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf knownBug} { +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80 } 3 -test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { - testutfnext \xA0\xA0\xA0\xA0 -} 1 -test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext fullutf knownBug} { +test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \xA0\xA0\xA0\xA0 } 3 -test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { +test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80\x80 } 3 test utf-6.96 {Tcl_UtfNext, read limits} testutfnext { @@ -619,18 +610,18 @@ test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0 2 } 0 -test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0G 3 -} 1 -test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.124 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0 3 -} 1 -test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.125 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0G 4 -} 1 -test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.126 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0\xA0 4 -} 1 +} 3 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} -- cgit v0.12 From cc86f6e11e85d0a01675ec31e3677ad2e63cddc4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 5 May 2020 13:23:17 +0000 Subject: Add 4 test-cases that could fool Tcl_UtfPrev (but ... actually they don't). Make sure that Tcl_UtfPrev() never reads more than 3 trail bytes (or 4 when TCL_UTF_MAX > 4). Those are the same limits as for Tcl_UtfNext() and Tcl_UtfToUniChar() --- generic/tclUtf.c | 2 +- tests/utf.test | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ac87978..2439a54 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -855,7 +855,7 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < ((TCL_UTF_MAX > 3) ? 4 : 3)); /* * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a diff --git a/tests/utf.test b/tests/utf.test index e8fa603..ffe7896 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -968,6 +968,18 @@ test utf-7.49.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} test utf-7.49.6 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x90\x80\x80 2 } 1 +test utf-7.50.0 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev ucs2} { + testutfprev \xF2\xA0\xA0\xA0\xA0 +} 2 +test utf-7.50.1 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev fullutf} { + testutfprev \xF2\xA0\xA0\xA0\xA0 +} 4 +test utf-7.51.0 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev ucs2} { + testutfprev \xF2\x80\x80\x80\x80 +} 2 +test utf-7.51.1 {Tcl_UtfPrev, 4-byte valid sequence with additional trail} {testutfprev fullutf} { + testutfprev \xF2\x80\x80\x80\x80 +} 4 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From 8fc378853391cde228bf25c1491e9ba02ebf0f2c Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 18:23:36 +0000 Subject: New approach to fixing the regression reported in [31aa44375d] builds on recent reforms. Older efforts aborted. --- generic/tclUtf.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 12eb637..8ae4b15 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -381,7 +381,7 @@ Tcl_UtfToUniChar( * characters representing themselves. */ -#if TCL_UTF_MAX <= 4 +#if TCL_UTF_MAX == 4 /* If *chPtr contains a high surrogate (produced by a previous * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation * bytes, then we must produce a follow-up low surrogate. We only @@ -437,7 +437,7 @@ Tcl_UtfToUniChar( * Four-byte-character lead byte followed by at least two trail bytes. * We don't test the validity of 3th trail byte, see [ed29806ba] */ -#if TCL_UTF_MAX <= 4 +#if TCL_UTF_MAX == 4 Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; if (high < 0x400) { @@ -446,7 +446,7 @@ Tcl_UtfToUniChar( return 1; } /* out of range, < 0x10000 or > 0x10FFFF */ -#else +#elif TCL_UTF_MAX > 4 if ((src[3] & 0xC0) == 0x80) { *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); @@ -617,25 +617,11 @@ Tcl_NumUtfChars( */ while (src <= optPtr /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { -#if TCL_UTF_MAX < 4 - if (((unsigned)UCHAR(*src) - 0xF0) < 5) { - /* treat F0 - F4 as single character */ - ch = 0; - src++; - } else -#endif src += TclUtfToUniChar(src, &ch); i++; } /* Loop over the remaining string where call must happen */ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { -#if TCL_UTF_MAX < 4 - if (((unsigned)UCHAR(*src) - 0xF0) < 5) { - /* treat F0 - F4 as single character */ - ch = 0; - src++; - } else -#endif src += TclUtfToUniChar(src, &ch); i++; } -- cgit v0.12 From 6d36267ee03ebbf37b1843d0602220bbc299f8e9 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 18:36:25 +0000 Subject: split and constrain the failing test. --- tests/encoding.test | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/encoding.test b/tests/encoding.test index 552c97f..84f9ae1 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -335,7 +335,12 @@ test encoding-15.4 {UtfToUtfProc emoji character input} -body { set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82] list [string length $x] $y } -result "6 \uD83D\uDE02" -test encoding-15.5 {UtfToUtfProc emoji character input} { +test encoding-15.5.0 {UtfToUtfProc emoji character input} ucs2 { + set x \xF0\x9F\x98\x82 + set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] + list [string length $x] $y +} "4 \xF0\x9F\x98\x82" +test encoding-15.5.1 {UtfToUtfProc emoji character input} fullutf { set x \xF0\x9F\x98\x82 set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y -- cgit v0.12