diff options
| -rw-r--r-- | generic/tclUtf.c | 30 | ||||
| -rw-r--r-- | tests/utf.test | 54 |
2 files changed, 49 insertions, 35 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8ae4b15..b855844 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -525,12 +525,13 @@ Tcl_UtfToUniCharDString( p += TclUtfToUniChar(p, &ch); *w++ = ch; } - while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUniChar(p, &ch); - *w++ = ch; - } while (p < endPtr) { - *w++ = UCHAR(*p++); + if (Tcl_UtfCharComplete(p, endPtr-p)) { + p += TclUtfToUniChar(p, &ch); + *w++ = ch; + } else { + *w++ = UCHAR(*p++); + } } *w = '\0'; Tcl_DStringSetLength(dsPtr, @@ -621,17 +622,18 @@ Tcl_NumUtfChars( i++; } /* Loop over the remaining string where call must happen */ - while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } i++; } - if (src < endPtr) { - /* - * String ends in an incomplete UTF-8 sequence. - * Count every byte in it. - */ - i += endPtr - src; - } } return i; } diff --git a/tests/utf.test b/tests/utf.test index 7b6bf1c..502aa37 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -92,8 +92,8 @@ test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { string length [testbytestring \xC2] } 1 -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { - string length [testbytestring \xC2\xA2] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { + string length \xA2 } 1 test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { string length [testbytestring \xE2] @@ -139,32 +139,32 @@ test utf-3.1 {Tcl_UtfCharComplete} { test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } 0 -test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC2\xA2] +test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { + testnumutfchars \xA2 } 1 test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E] + testnumutfchars abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E] } 7 -test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC0\x80] +test utf-4.4 {Tcl_NumUtfChars: #x00} testnumutfchars { + testnumutfchars \x00 } 1 test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } 0 test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC2\xA2] end + testnumutfchars \xA2 end } 1 test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E] end + testnumutfchars abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E] end } 7 -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC0\x80] end +test utf-4.8 {Tcl_NumUtfChars: #x00, calc len} testnumutfchars { + testnumutfchars \x00 end } 1 # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \xE2\x82\xAC] end-1 } 2 -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { +test utf-4.10 {Tcl_NumUtfChars: #x00, calc len, overcomplete} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \x00] end+1 } 2 test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { @@ -179,6 +179,9 @@ test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars test test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs4} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 1 +test utf-4.13 {Tcl_NumUtfChars: end of string} {testnumutfchars testbytestring} { + testnumutfchars foobar[testbytestring \xF2\xC2\xA0] end +} 8 test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { testfindfirst [testbytestring abcbc] 98 @@ -187,15 +190,15 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { testfindlast [testbytestring abcbc] 98 } bc -test utf-6.1 {Tcl_UtfNext} {testutfnext testbytestring} { +test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. testutfnext {} } 1 -test utf-6.2 {Tcl_UtfNext} {testutfnext testbytestring} { +test utf-6.2 {Tcl_UtfNext} testutfnext { testutfnext A } 1 -test utf-6.3 {Tcl_UtfNext} {testutfnext testbytestring} { +test utf-6.3 {Tcl_UtfNext} testutfnext { testutfnext AA } 1 test utf-6.4 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -447,8 +450,8 @@ test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 4 -test utf-6.80 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { - testutfnext [testbytestring \xC0\x80] +test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \x00 } 2 test utf-6.81 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { testutfnext [testbytestring \xC0\x81] @@ -859,8 +862,8 @@ test utf-7.31 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.32 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] 2 } 1 -test utf-7.33 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { - testutfprev A[testbytestring \xC0\x80] +test utf-7.33 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { + testutfprev A\x00 } 1 test utf-7.34 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xC1\x80] @@ -916,7 +919,7 @@ test utf-7.46.1 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestr test utf-7.47 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev testbytestring} { testutfprev [testbytestring \xE8\xA0] } 0 -test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev testbytestring} { +test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} testutfprev { testutfprev \u8820 2 } 0 test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev testbytestring} { @@ -1459,7 +1462,16 @@ UniCharCaseCmpTest > [format %c 0x10000] \uFFFF ucs4 UniCharCaseCmpTest > \U10000 \uFFFF {Uesc ucs4} - +test utf-26.1 {Tcl_UniCharDString} -setup { + testobj freeallvars +} -constraints {teststringobj} -cleanup { + testobj freeallvars +} -body { + teststringobj set 1 foo + teststringobj getunicode 1 + teststringobj append 1 [testbytestring barsoom\xF2\xC2\x80] 10 + scan [string index [teststringobj get 1] 11] %c +} -result 128 unset count |
