Add UTF16 and UTF32 tests

author: apnadkarni <apnmbx-wits@yahoo.com> 2023-02-20 15:08:58 (GMT)
committer: apnadkarni <apnmbx-wits@yahoo.com> 2023-02-20 15:08:58 (GMT)
commit: 41af9f9e84d0b6cee2116ff08e297db05786e6ce (patch)
tree: cd8ee1bb9fa530176148b1651e8b9701a9f37b92 /tests/cmdAH.test
parent: 9f595d2fa36d13395f1bfb16559f7519c08e873f (diff)
download: tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.zip
tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.gz
tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.bz2
1 files changed, 137 insertions, 56 deletions
diff --git a/tests/cmdAH.test b/tests/cmdAH.test
index faa604a..1fbe6d2 100644
--- a/tests/cmdAH.test
+++ b/tests/cmdAH.test
@@ -185,15 +185,58 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation
 
 # TODO - valid sequences for different encodings - shiftjis etc.
 # Note utf-16, utf-32 missing because they are automatically
-# generated based on le/be versions. Also add all ranges from Unicode standard
-# Table 3.7
+# generated based on le/be versions.
 set encValidStrings {
-    ascii    ABC                           414243
-    utf-8    A\u0000\u03A9\u8A9E\U00010384 4100CEA9E8AA9EF0908E84
-    utf-16le A\u0000\u03A9\u8A9E\U00010384 41000000A9039E8A00D884DF
-    utf-16be A\u0000\u03A9\u8A9E\U00010384 0041000003A98A9ED800DF84
-    utf-32le A\u0000\u03A9\u8A9E\U00010384 4100000000000000A90300009E8A000084030100
-    utf-32be A\u0000\u03A9\u8A9E\U00010384 0000004100000000000003A900008A9E00010384
+    ascii    \u0000 00 {} {Lowest ASCII}
+    ascii    \u007F 7F knownBug {Highest ASCII}
+
+    utf-8    \u0000 00 {} {Unicode Table 3.7 Row 1}
+    utf-8    \u007F 7F {} {Unicode Table 3.7 Row 1}
+    utf-8    \u0080 C280 {} {Unicode Table 3.7 Row 2}
+    utf-8    \u07FF DFBF {} {Unicode Table 3.7 Row 2}
+    utf-8    \u0800 E0A080 {} {Unicode Table 3.7 Row 3}
+    utf-8    \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3}
+    utf-8    \u1000 E18080 {} {Unicode Table 3.7 Row 4}
+    utf-8    \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4}
+    utf-8    \uD000 ED8080 {} {Unicode Table 3.7 Row 5}
+    utf-8    \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5}
+    utf-8    \uE000 EE8080 {} {Unicode Table 3.7 Row 6}
+    utf-8    \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6}
+    utf-8    \U10000 F0908080 {} {Unicode Table 3.7 Row 7}
+    utf-8    \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7}
+    utf-8    \U40000 F1808080 {} {Unicode Table 3.7 Row 8}
+    utf-8    \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8}
+    utf-8    \U100000 F4808080 {} {Unicode Table 3.7 Row 9}
+    utf-8    \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9}
+    utf-8    A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5}
+
+    utf-16le \u0000 0000 {} {Lowest code unit}
+    utf-16le \uD7FF FFD7 {} {Below high surrogate range}
+    utf-16le \uE000 00E0 {} {Above low surrogate range}
+    utf-16le \uFFFF FFFF {} {Highest code unit}
+    utf-16le \U010000 00D800DC {} {First surrogate pair}
+    utf-16le \U10FFFF FFDBFFDF {} {Last surrogate pair}
+    utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5}
+
+    utf-16be \u0000 0000 {} {Lowest code unit}
+    utf-16be \uD7FF D7FF {} {Below high surrogate range}
+    utf-16be \uE000 E000 {} {Above low surrogate range}
+    utf-16be \uFFFF FFFF {} {Highest code unit}
+    utf-16be \U010000 D800DC00 {} {First surrogate pair}
+    utf-16be \U10FFFF DBFFDFFF {} {Last surrogate pair}
+    utf-16be A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5}
+
+    utf-32le \u0000 00000000 {} {Lowest code unit}
+    utf-32le \uFFFF FFFF0000 {} {Highest BMP}
+    utf-32le \U010000 00000100 {} {First supplementary}
+    utf-32le \U10FFFF ffff1000 {} {Last supplementary}
+    utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5}
+
+    utf-32be \u0000 00000000 {} {Lowest code unit}
+    utf-32be \uFFFF 0000FFFF {} {Highest BMP}
+    utf-32be \U010000 00010000 {} {First supplementary}
+    utf-32be \U10FFFF 0010FFFF {} {Last supplementary}
+    utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5}
 }
 
 # Invalid byte sequences. These are driven from a table with format
@@ -211,8 +254,7 @@ set encValidStrings {
 # If the ctrl field is empty it is treated as all of the above
 # Note if there is any other value by itself, it will cause the test to
 # be skipped. This is intentional to skip known bugs.
-
-# TODO - other encodings and test cases
+# TODO - non-UTF encodings
 
 # ascii - Any byte above 127 is invalid and is mapped
 # to the same numeric code point except for the range
@@ -616,8 +658,6 @@ lappend encInvalidBytes {*}{
     utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
     utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
 
-
-
     utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
     utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}
     utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
@@ -631,42 +671,73 @@ lappend encInvalidBytes {*}{
     utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}
 }
 
-set xxencInvalidBytes {
-    
-    utf-8 \x41\x80\x42     tcl8      A\u0080B -1 80
-    utf-8 \x41\x80\x42     replace   A\uFFFDB -1 80
-    utf-8 \x41\x80\x42     strict    A         1 80
-    utf-8 \x41\xC0\x80\x42 tcl8      A\u0000B -1 C080
-    utf-8 \x41\xC0\x80\x42 strict    A         1 C080
-    utf-8 \x41\xC1\x42     tcl8      A\u00C1B -1 C1
-    utf-8 \x41\xC1\x42     replace   A\uFFFDB -1 C1
-    utf-8 \x41\xC1\x42     strict    A         1 C1
-    utf-8 \x41\xC2\x42     tcl8      A\u00C2B -1 C2-nontrail
-    utf-8 \x41\xC2\x42     replace   A\uFFFDB -1 C2-nontrail
-    utf-8 \x41\xC2\x42     strict    A         1 C2-nontrail
-    utf-8 \x41\xC2         tcl8      A\u00C2  -1 C2-incomplete
-    utf-8 \x41\xC2         replace   A\uFFFD  -1 C2-incomplete
-    utf-8 \x41\xC2         strict    A         1 C2-incomplete
-    utf-8 A\xed\xa0\x80B   tcl8      A\uD800B -1 High-surrogate
-    utf-8 A\xed\xa0\x80B   strict    A         1 High-surrogate
-    utf-8 A\xed\xb0\x80B   tcl8      A\uDC00B -1 Low-surrogate
-    utf-8 A\xed\xb0\x80B   strict    A         1 Low-surrogate
-    utf-8 \xed\xa0\x80\xed\xb0\x80   tcl8      \U00010000 -1 High-low-surrogate
-    utf-8 \xed\xa0\x80\xed\xb0\x80   strict    {}          0 High-low-surrogate
+# utf-16le test cases (no utf-16be cases are listed here yet). Note utf-16 cases
+# are automatically generated based on these depending on platform endianness.
+# Note truncated tests can only happen when the sequence is at the end (including
+# by itself), thus {solo tail} in some cases.
+lappend encInvalidBytes {*}{
+    utf-16le 41      tcl8      {}  -1 {solo tail} {Truncated}
+    utf-16le 41      replace   \uFFFD  -1 {solo tail} {Truncated}
+    utf-16le 41      strict    {}   0 {solo tail} {Truncated}
+    utf-16le 00D8    tcl8      \uD800 -1 {} {Missing low surrogate}
+    utf-16le 00D8    replace   \uFFFD -1 {knownBug} {Missing low surrogate}
+    utf-16le 00D8    strict    {}      0 {knownBug} {Missing low surrogate}
+    utf-16le 00DC    tcl8      \uDC00 -1 {} {Missing high surrogate}
+    utf-16le 00DC    replace   \uFFFD -1 {knownBug} {Missing high surrogate}
+    utf-16le 00DC    strict    {}      0 {knownBug} {Missing high surrogate}
 }
-set utf32-le-TODO {
-    utf-32le \x00\xD8\x00\x00                                   tcl8     \uD800   -1 {High-surrogate}
-    utf-32le \x00\xD8\x00\x00                                   strict   ""        0 {High-surrogate}
-    utf-32le \x00\xDC\x00\x00                                   tcl8     \uDC00   -1 {Low-surrogate}
-    utf-32le \x00\xDC\x00\x00                                   strict   ""        0 {Low-surrogate}
-    utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00                   tcl8     \uD800\uDC00  -1 {High-low-surrogate}
-    utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00                   strict   ""             0 {High-low-surrogate}
-    utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00                   tcl8     \uDC00\uD800  -1 {High-low-surrogate}
-    utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00                   strict   ""             0 {High-low-surrogate}
-    utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00   tcl8     A\uD800B -1 {High-surrogate-middle}
-    utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00   strict   A         4 {High-surrogate-middle}
+
+# utf-32le and utf-32be test cases. Note utf-32 cases are automatically generated
+# based on these depending on platform endianness. Note truncated tests can only
+# happen when the sequence is at the end (including by itself), thus {solo tail}
+# in some cases.
+lappend encInvalidBytes {*}{
+    utf-32le 41      tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32le 41      replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 41      strict    {}   0 {solo tail} {Truncated}
+    utf-32le 4100    tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32le 4100    replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 4100    strict    {}   0 {solo tail} {Truncated}
+    utf-32le 410000  tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32le 410000  replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 410000  strict    {}   0 {solo tail} {Truncated}
+    utf-32le 00D80000 tcl8     \uD800   -1 {} {High-surrogate}
+    utf-32le 00D80000 replace  \uFFFD   -1 {} {High-surrogate}
+    utf-32le 00D80000 strict   {}        0 {} {High-surrogate}
+    utf-32le 00DC0000 tcl8     \uDC00   -1 {} {Low-surrogate}
+    utf-32le 00DC0000 replace  \uFFFD   -1 {} {Low-surrogate}
+    utf-32le 00DC0000 strict   {}        0 {} {Low-surrogate}
+    utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
+    utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
+    utf-32le 00D8000000DC0000 strict  {}            0 {} {High-low-surrogate-pair}
+    utf-32le 00001100 tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32le 00001100 replace \UFFFD -1 {} {Out of range}
+    utf-32le 00001100 strict {}       0 {} {Out of range}
+    utf-32le FFFFFFFF tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range}
+    utf-32le FFFFFFFF strict {}       0 {} {Out of range}
+
+    utf-32be 41      tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32be 0041    tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32be 000041  tcl8      {}  -1 {solo tail} {Truncated}
+    utf-32be 0000D800 tcl8     \uD800   -1 {} {High-surrogate}
+    utf-32be 0000D800 replace  \uFFFD   -1 {} {High-surrogate}
+    utf-32be 0000D800 strict   {}        0 {} {High-surrogate}
+    utf-32be 0000DC00 tcl8     \uDC00   -1 {} {Low-surrogate}
+    utf-32be 0000DC00 replace  \uFFFD   -1 {} {Low-surrogate}
+    utf-32be 0000DC00 strict   {}        0 {} {Low-surrogate}
+    utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
+    utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
+    utf-32be 0000D8000000DC00 strict  {}            0 {} {High-low-surrogate-pair}
+    utf-32be 00110000 tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32be 00110000 replace \UFFFD -1 {} {Out of range}
+    utf-32be 00110000 strict {}       0 {} {Out of range}
+    utf-32be FFFFFFFF tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range}
+    utf-32be FFFFFFFF strict {}       0 {} {Out of range}
 }
 
+
 # Strings that cannot be encoded for specific encoding / profiles
 # {encoding string profile exptedresult expectedfailindex ctrl comment}
 # <enc,string,profile> should be unique for test ids to be unique.
@@ -682,7 +753,7 @@ set utf32-le-TODO {
 # If the ctrl field is empty it is treated as all of the above
 # Note if there is any other value by itself, it will cause the test to
 # be skipped. This is intentional to skip known bugs.
-# TODO - other encodings and test cases
+# TODO - other encodings
 # TODO - out of range code point (note cannot be generated by \U notation)
 set encUnencodableStrings {
     ascii \u00e0 tcl8    3f -1 {} {unencodable}
@@ -883,7 +954,8 @@ testconvert cmdAH-4.3.12 {
 }
 
 # convertfrom ?-profile? : valid byte sequences
-foreach {enc str hex} $encValidStrings {
+foreach {enc str hex ctrl comment} $encValidStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set prefix A
     set suffix B
@@ -899,6 +971,7 @@ foreach {enc str hex} $encValidStrings {
 
 # convertfrom ?-profile? : invalid byte sequences
 foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary format H* $hex]
     set prefix A
     set suffix B
@@ -945,12 +1018,13 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
 }
 
 # convertfrom -failindex ?-profile? - valid data
-foreach {enc str hex} $encValidStrings {
+foreach {enc str hex ctrl comment} $encValidStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set prefix A
     set suffix B
-    set prefix_bytes [encoding convertto $enc A]
-    set suffix_bytes [encoding convertto $enc B]
+    set prefix_bytes [encoding convertto $enc $prefix]
+    set suffix_bytes [encoding convertto $enc $suffix]
     foreach profile $encProfiles {
         testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile
         testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile
@@ -961,11 +1035,14 @@ foreach {enc str hex} $encValidStrings {
 
 # convertfrom -failindex ?-profile? - invalid data
 foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
+    if {"knownBug" in $ctrl} continue
     # There are multiple test cases based on location of invalid bytes
     set bytes [binary decode hex $hex]
     set prefix A
     set suffix B
-    set prefixLen [string length [encoding convertto $enc $prefix]]
+    set prefix_bytes [encoding convertto $enc $prefix]
+    set suffix_bytes [encoding convertto $enc $suffix]
+    set prefixLen [string length $prefix_bytes]
     if {$ctrl eq {} || "solo" in $ctrl} {
         testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile
     }
@@ -977,7 +1054,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
             # Failure expected
             set result ""
         }
-        testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile
+        testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile
     }
     if {$ctrl eq {} || "tail" in $ctrl} {
         set expected_failidx $failidx
@@ -989,7 +1066,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
             set result $prefix
             incr expected_failidx $prefixLen
         }
-        testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile
+        testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile
     }
     if {$ctrl eq {} || "middle" in $ctrl} {
         set expected_failidx $failidx
@@ -1001,7 +1078,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
             set result $prefix
             incr expected_failidx $prefixLen
         }
-        testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile
+        testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile
     }
 }
 
@@ -1041,7 +1118,8 @@ testconvert cmdAH-4.4.12 {
 
 # convertto ?-profile? : valid byte sequences
 
-foreach {enc str hex} $encValidStrings {
+foreach {enc str hex ctrl comment} $encValidStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set printable [printable $str]
     set prefix A
@@ -1058,6 +1136,7 @@ foreach {enc str hex} $encValidStrings {
 
 # convertto ?-profile? : invalid byte sequences
 foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set printable [printable $str]
     set prefix A
@@ -1105,7 +1184,8 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
 }
 
 # convertto -failindex ?-profile? - valid data
-foreach {enc str hex} $encValidStrings {
+foreach {enc str hex ctrl comment} $encValidStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set printable [printable $str]
     set prefix A
@@ -1122,6 +1202,7 @@ foreach {enc str hex} $encValidStrings {
 
 # convertto -failindex ?-profile? - invalid data
 foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
+    if {"knownBug" in $ctrl} continue
     set bytes [binary decode hex $hex]
     set printable [printable $str]
     set prefix A
author	apnadkarni <apnmbx-wits@yahoo.com>	2023-02-20 15:08:58 (GMT)
committer	apnadkarni <apnmbx-wits@yahoo.com>	2023-02-20 15:08:58 (GMT)
commit	41af9f9e84d0b6cee2116ff08e297db05786e6ce (patch)
tree	cd8ee1bb9fa530176148b1651e8b9701a9f37b92 /tests/cmdAH.test
parent	9f595d2fa36d13395f1bfb16559f7519c08e873f (diff)
download	tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.zip tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.gz tcl-41af9f9e84d0b6cee2116ff08e297db05786e6ce.tar.bz2