From 9d533c3d4842c4792ffb95a166933602ccba7895 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 5 May 2020 07:29:46 +0000 Subject: Properly protect "Invalid" function against lead bytes 0x80-0xBF. This fixes "knownBug" testcase utf-6.93.1. Rename tip389 selector to utf16, since that's what it actually is, in contrast to ucs2 and ucs4. --- generic/tclUtf.c | 4 ++-- tests/utf.test | 59 ++++++++++++++++++++++++++++++++------------------------ 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 32fb743..528d5de 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -159,7 +159,7 @@ Invalid( unsigned char byte = *src; int index; - if (byte % 0x04) { + if ((byte & 0xC3) != 0xC0) { /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } @@ -662,7 +662,7 @@ Tcl_UtfNext( } next++; } - if ((next == src + 1) || Invalid((unsigned char *)src)) { + if (Invalid((unsigned char *)src)) { return src + 1; } return next; diff --git a/tests/utf.test b/tests/utf.test index 988bede..c281d11 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -17,7 +17,7 @@ namespace path ::tcl::mathop testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] -testConstraint tip389 [expr {[string length [format %c 0x10000]] == 2}] +testConstraint utf16 [expr {[string length [format %c 0x10000]] == 2}] testConstraint ucs4 [expr {[testConstraint fullutf] && [string length [format %c 0x10000]] == 1}] @@ -108,7 +108,7 @@ test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytest test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs4} { string length [testbytestring \xF0\x90\x80\x80] } 1 -test utf-2.8.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring tip389} { +test utf-2.8.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring utf16} { string length [testbytestring \xF0\x90\x80\x80] } 2 test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytestring ucs2} { @@ -117,7 +117,7 @@ test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {testbytest test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {Uesc ucs4} { string length \U10FFFF } 1 -test utf-2.9.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} tip389 { +test utf-2.9.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} utf16 { string length \uDBFF\uDFFF } 2 test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -174,7 +174,7 @@ test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars test test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs4} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 1 -test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} { +test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring utf16} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 2 @@ -490,16 +490,25 @@ test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { testutfnext \xF4\x90\x80\x80 } 1 -test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { +test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext ucs2} { + testutfnext \xA0\xA0\xA0 +} 1 +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext utf16} { + testutfnext \xA0\xA0\xA0 +} 3 +test utf-6.92.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext ucs4} { testutfnext \xA0\xA0\xA0 } 1 test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { testutfnext \x80\x80\x80 } 1 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf knownBug} { +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext utf16} { testutfnext \x80\x80\x80 } 3 -test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { +test utf-6.93.2 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs4} { + testutfnext \x80\x80\x80 +} 1 +test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0\xA0 } 1 test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { @@ -610,16 +619,16 @@ test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0 2 } 1 -test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0G 3 } 1 -test utf-6.124 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0\xA0 3 } 1 -test utf-6.125 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0\xA0G 4 } 1 -test utf-6.126 {Tcl_UtfNext, read limits} testutfnext { +test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0\xA0\xA0 4 } 1 @@ -987,7 +996,7 @@ test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { string index \uD842 0 } \uD842 -test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} tip389 { +test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} utf16 { string index \uD842 0 } \uD842 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { @@ -999,7 +1008,7 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 } \U1F600 -test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} tip389 { +test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} utf16 { string index \uD83D\uDE00G 0 } \U1F600 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { @@ -1008,7 +1017,7 @@ test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G -test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} tip389 { +test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} utf16 { string index \uD83D\uDE00G 1 } {} test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { @@ -1017,7 +1026,7 @@ test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.9.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 2 } {} -test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} tip389 { +test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} utf16 { string index \uD83D\uDE00G 2 } G test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { @@ -1026,7 +1035,7 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 } \U1F600 -test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { +test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc utf16} { string index \U1F600G 0 } \U1F600 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { @@ -1035,7 +1044,7 @@ test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 1 } G -test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { +test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc utf16} { string index \U1F600G 1 } {} test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { @@ -1044,7 +1053,7 @@ test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.12.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 2 } {} -test utf-8.12.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { +test utf-8.12.2 {Tcl_UniCharAtIndex: Emoji} {Uesc utf16} { string index \U1F600G 2 } G @@ -1060,7 +1069,7 @@ test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 { test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 { string range \uD83D\uDE00G 0 0 } \U1F600 -test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} tip389 { +test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} utf16 { string range \uD83D\uDE00G 0 0 } \U1F600 test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { @@ -1069,7 +1078,7 @@ test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { string range \uD83D\uDE00G 1 1 } G -test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { +test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { string range \uD83D\uDE00G 1 1 } {} test utf-9.5.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { @@ -1078,7 +1087,7 @@ test utf-9.5.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { test utf-9.5.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { string range \uD83D\uDE00G 2 2 } {} -test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} tip389 { +test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { string range \uD83D\uDE00G 2 2 } G test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} { @@ -1087,7 +1096,7 @@ test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs2} { test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc ucs4} { string range \U1f600G 0 0 } \U1F600 -test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc tip389} { +test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} {Uesc utf16} { string range \U1f600G 0 0 } \U1F600 test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { @@ -1096,7 +1105,7 @@ test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} { string range \U1f600G 1 1 } G -test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} { +test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc utf16} { string range \U1f600G 1 1 } {} test utf-9.8.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { @@ -1105,7 +1114,7 @@ test utf-9.8.0 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs2} { test utf-9.8.1 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc ucs4} { string range \U1f600G 2 2 } {} -test utf-9.8.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc tip389} { +test utf-9.8.2 {Tcl_UtfAtIndex: index > 0, Emoji} {Uesc utf16} { string range \U1f600G 2 2 } G @@ -1333,7 +1342,7 @@ test utf-19.1 {TclUniCharLen} -body { unset -nocomplain foo } -result {1 4} -test utf-20.1 {TclUniCharNcmp} {ucs4} { +test utf-20.1 {TclUniCharNcmp} ucs4 { string compare [string range [format %c 0xFFFF] 0 0] [string range [format %c 0x10000] 0 0] } -1 test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} { -- cgit v0.12 From 9ae5f06a2b8f68f611a804c7a2d7ee2e1f9ab759 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 5 May 2020 07:44:36 +0000 Subject: One more tip389 selector --- tests/string.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/string.test b/tests/string.test index e4c39d2..124bda7 100644 --- a/tests/string.test +++ b/tests/string.test @@ -24,7 +24,7 @@ catch [list package require -exact Tcltest [info patchlevel]] testConstraint testobj [expr {[info commands testobj] != {}}] testConstraint testindexobj [expr {[info commands testindexobj] != {}}] -testConstraint tip389 [expr {[string length \U010000] == 2}] +testConstraint utf16 [expr {[string length \U010000] == 2}] testConstraint testbytestring [llength [info commands testbytestring]] # Used for constraining memory leak tests @@ -1299,7 +1299,7 @@ test string-12.22 {string range, shimmering binary/index} { binary scan $s a* x string range $s $s end } 000000001 -test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} tip389 { +test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} utf16 { list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3] } [list \U100000 {} b] -- cgit v0.12