diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-20 15:08:58 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-20 15:08:58 (GMT) |
commit | 41af9f9e84d0b6cee2116ff08e297db05786e6ce (patch) | |
tree | cd8ee1bb9fa530176148b1651e8b9701a9f37b92 /tests/cmdAH.test | |
parent | 9f595d2fa36d13395f1bfb16559f7519c08e873f (diff) | |
download | tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.zip tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.gz tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.bz2 |
Add UTF16 and UTF32 tests
Diffstat (limited to 'tests/cmdAH.test')
-rw-r--r-- | tests/cmdAH.test | 193 |
1 file changed, 137 insertions, 56 deletions
diff --git a/tests/cmdAH.test b/tests/cmdAH.test index faa604a..1fbe6d2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -185,15 +185,58 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. Also add all ranges from Unicode standard -# Table 3.7 +# generated based on le/be versions. set encValidStrings { - ascii ABC 414243 - utf-8 A\u0000\u03A9\u8A9E\U00010384 4100CEA9E8AA9EF0908E84 - utf-16le A\u0000\u03A9\u8A9E\U00010384 41000000A9039E8A00D884DF - utf-16be A\u0000\u03A9\u8A9E\U00010384 0041000003A98A9ED800DF84 - utf-32le A\u0000\u03A9\u8A9E\U00010384 4100000000000000A90300009E8A000084030100 - utf-32be A\u0000\u03A9\u8A9E\U00010384 0000004100000000000003A900008A9E00010384 + ascii \u0000 00 {} {Lowest ASCII} + ascii \u007F 7F knownBug {Highest ASCII} + + utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} + utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} + utf-8 \u0080 C280 {} {Unicode Table 3.7 Row 2} + utf-8 \u07FF DFBF {} {Unicode Table 3.7 Row 2} + utf-8 \u0800 E0A080 {} {Unicode Table 3.7 Row 3} + utf-8 \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3} + utf-8 \u1000 E18080 {} {Unicode Table 3.7 Row 4} + utf-8 \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4} + utf-8 \uD000 ED8080 {} {Unicode Table 3.7 Row 5} + utf-8 \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5} + utf-8 \uE000 EE8080 {} {Unicode Table 3.7 Row 6} + utf-8 \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6} + utf-8 \U10000 F0908080 {} {Unicode Table 3.7 Row 7} + utf-8 \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7} + utf-8 \U40000 F1808080 {} {Unicode Table 3.7 Row 8} + utf-8 \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8} + utf-8 \U100000 F4808080 {} {Unicode Table 3.7 Row 9} + utf-8 \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9} + utf-8 A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5} + + utf-16le \u0000 0000 {} {Lowest code 
unit} + utf-16le \uD7FF FFD7 {} {Below high surrogate range} + utf-16le \uE000 00E0 {} {Above low surrogate range} + utf-16le \uFFFF FFFF {} {Highest code unit} + utf-16le \U010000 00D800DC {} {First surrogate pair} + utf-16le \U10FFFF FFDBFFDF {} {Last surrogate pair} + utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5} + + utf-16be \u0000 0000 {} {Lowest code unit} + utf-16be \uD7FF D7FF {} {Below high surrogate range} + utf-16be \uE000 E000 {} {Above low surrogate range} + utf-16be \uFFFF FFFF {} {Highest code unit} + utf-16be \U010000 D800DC00 {} {First surrogate pair} + utf-16be \U10FFFF DBFFDFFF {} {Last surrogate pair} + utf-16be A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5} + + utf-32le \u0000 00000000 {} {Lowest code unit} + utf-32le \uFFFF FFFF0000 {} {Highest BMP} + utf-32le \U010000 00000100 {} {First supplementary} + utf-32le \U10FFFF ffff1000 {} {Last supplementary} + utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5} + + utf-32be \u0000 00000000 {} {Lowest code unit} + utf-32be \uFFFF 0000FFFF {} {Highest BMP} + utf-32be \U010000 00010000 {} {First supplementary} + utf-32be \U10FFFF 0010FFFF {} {Last supplementary} + utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5} } # Invalid byte sequences. These are driven from a table with format @@ -211,8 +254,7 @@ set encValidStrings { # If the ctrl field is empty it is treated as all of the above # Note if there is any other value by itself, it will cause the test to # be skipped. This is intentional to skip known bugs. 
- -# TODO - other encodings and test cases +# TODO - non-UTF encodings # ascii - Any byte above 127 is invalid and is mapped # to the same numeric code point except for the range @@ -616,8 +658,6 @@ lappend encInvalidBytes {*}{ utf-8 F48FBFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F48FBFD0 strict {} 0 {} {Third trail byte must be 80:BF} - - utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} @@ -631,42 +671,73 @@ lappend encInvalidBytes {*}{ utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3.11} } -set xxencInvalidBytes { - - utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 - utf-8 \x41\x80\x42 replace A\uFFFDB -1 80 - utf-8 \x41\x80\x42 strict A 1 80 - utf-8 \x41\xC0\x80\x42 tcl8 A\u0000B -1 C080 - utf-8 \x41\xC0\x80\x42 strict A 1 C080 - utf-8 \x41\xC1\x42 tcl8 A\u00C1B -1 C1 - utf-8 \x41\xC1\x42 replace A\uFFFDB -1 C1 - utf-8 \x41\xC1\x42 strict A 1 C1 - utf-8 \x41\xC2\x42 tcl8 A\u00C2B -1 C2-nontrail - utf-8 \x41\xC2\x42 replace A\uFFFDB -1 C2-nontrail - utf-8 \x41\xC2\x42 strict A 1 C2-nontrail - utf-8 \x41\xC2 tcl8 A\u00C2 -1 C2-incomplete - utf-8 \x41\xC2 replace A\uFFFD -1 C2-incomplete - utf-8 \x41\xC2 strict A 1 C2-incomplete - utf-8 A\xed\xa0\x80B tcl8 A\uD800B -1 High-surrogate - utf-8 A\xed\xa0\x80B strict A 1 High-surrogate - utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate - utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 strict {} 0 High-low-surrogate +# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-16le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 strict {} 0 {solo tail} {Truncated} + utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} + utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16le 00D8 strict {} 0 {knownBug} {Missing low surrogate} + utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} + utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16le 00DC strict {} 0 {knownBug} {Missing high surrogate} } -set utf32-le-TODO { - utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} - utf-32le \x00\xD8\x00\x00 strict "" 0 {High-surrogate} - utf-32le \x00\xDC\x00\x00 tcl8 \uDC00 -1 {Low-surrogate} - utf-32le \x00\xDC\x00\x00 strict "" 0 {Low-surrogate} - utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 tcl8 \uD800\uDC00 -1 {High-low-surrogate} - utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 tcl8 \uDC00\uD800 -1 {High-low-surrogate} - utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate-middle} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate-middle} + +# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-32le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 41 replace \uFFFD -1 {solo} {Truncated} + utf-32le 41 strict {} 0 {solo tail} {Truncated} + utf-32le 4100 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 4100 replace \uFFFD -1 {solo} {Truncated} + utf-32le 4100 strict {} 0 {solo tail} {Truncated} + utf-32le 410000 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} + utf-32le 410000 strict {} 0 {solo tail} {Truncated} + utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} + utf-32le 00D80000 replace \uFFFD -1 {} {High-surrogate} + utf-32le 00D80000 strict {} 0 {} {High-surrogate} + utf-32le 00DC0000 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32le 00DC0000 replace \uFFFD -1 {} {Low-surrogate} + utf-32le 00DC0000 strict {} 0 {} {Low-surrogate} + utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 strict {} 0 {} {High-low-surrogate-pair} + utf-32le 00001100 tcl8 \UFFFD -1 {} {Out of range} + utf-32le 00001100 replace \UFFFD -1 {} {Out of range} + utf-32le 00001100 strict {} 0 {} {Out of range} + utf-32le FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF strict {} 0 {} {Out of range} + + utf-32be 41 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 0041 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 000041 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} + utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} + utf-32be 0000D800 strict {} 0 {} {High-surrogate} + utf-32be 0000DC00 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32be 0000DC00 replace \uFFFD -1 {} {Low-surrogate} + utf-32be 0000DC00 strict {} 0 {} {Low-surrogate} + utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} 
{High-low-surrogate-pair} + utf-32be 0000D8000000DC00 strict {} 0 {} {High-low-surrogate-pair} + utf-32be 00110000 tcl8 \UFFFD -1 {} {Out of range} + utf-32be 00110000 replace \UFFFD -1 {} {Out of range} + utf-32be 00110000 strict {} 0 {} {Out of range} + utf-32be FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF strict {} 0 {} {Out of range} } + # Strings that cannot be encoded for specific encoding / profiles # {encoding string profile exptedresult expectedfailindex ctrl comment} # <enc,string,profile> should be unique for test ids to be unique. @@ -682,7 +753,7 @@ set utf32-le-TODO { # If the ctrl field is empty it is treated as all of the above # Note if there is any other value by itself, it will cause the test to # be skipped. This is intentional to skip known bugs. -# TODO - other encodings and test cases +# TODO - other encodings # TODO - out of range code point (note cannot be generated by \U notation) set encUnencodableStrings { ascii \u00e0 tcl8 3f -1 {} {unencodable} @@ -883,7 +954,8 @@ testconvert cmdAH-4.3.12 { } # convertfrom ?-profile? : valid byte sequences -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set prefix A set suffix B @@ -899,6 +971,7 @@ foreach {enc str hex} $encValidStrings { # convertfrom ?-profile? : invalid byte sequences foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + if {"knownBug" in $ctrl} continue set bytes [binary format H* $hex] set prefix A set suffix B @@ -945,12 +1018,13 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } # convertfrom -failindex ?-profile? 
- valid data -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set prefix A set suffix B - set prefix_bytes [encoding convertto $enc A] - set suffix_bytes [encoding convertto $enc B] + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] foreach profile $encProfiles { testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile @@ -961,11 +1035,14 @@ foreach {enc str hex} $encValidStrings { # convertfrom -failindex ?-profile? - invalid data foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + if {"knownBug" in $ctrl} continue # There are multiple test cases based on location of invalid bytes set bytes [binary decode hex $hex] set prefix A set suffix B - set prefixLen [string length [encoding convertto $enc $prefix]] + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] + set prefixLen [string length $prefix_bytes] if {$ctrl eq {} || "solo" in $ctrl} { testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile } @@ -977,7 +1054,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set result "" } - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -989,7 +1066,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile + 
testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -1001,7 +1078,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile } } @@ -1041,7 +1118,8 @@ testconvert cmdAH-4.4.12 { # convertto ?-profile? : valid byte sequences -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1058,6 +1136,7 @@ foreach {enc str hex} $encValidStrings { # convertto ?-profile? : invalid byte sequences foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1105,7 +1184,8 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { } # convertto -failindex ?-profile? - valid data -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1122,6 +1202,7 @@ foreach {enc str hex} $encValidStrings { # convertto -failindex ?-profile? - invalid data foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A |