Factor out encoding test vectors into separate file so they can be used for file IO tests

author: apnadkarni <apnmbx-wits@yahoo.com> 2023-02-24 09:35:09 (GMT)
committer: apnadkarni <apnmbx-wits@yahoo.com> 2023-02-24 09:35:09 (GMT)
commit: 854369a67c1719356d036c3fe11e052a7fe62e80 (patch)
tree: 2c2c51b218930fb40ec957e8f513e13315ceda6e /tests/encodingVectors.tcl
parent: 485bc2fd887abb2501321c670e66c849da1b026c (diff)
download: tcl-854369a67c1719356d036c3fe11e052a7fe62e80.zip
tcl-854369a67c1719356d036c3fe11e052a7fe62e80.tar.gz
tcl-854369a67c1719356d036c3fe11e052a7fe62e80.tar.bz2
1 files changed, 655 insertions, 0 deletions
diff --git a/tests/encodingVectors.tcl b/tests/encodingVectors.tcl
new file mode 100644
index 0000000..986e221
--- /dev/null
+++ b/tests/encodingVectors.tcl
@@ -0,0 +1,655 @@
+# This file contains test vectors for verifying various encodings. They are
+# stored in a common file so that they can be sourced into the various test
+# modules that are dependent on encodings. This file contains statically defined
+# test vectors. In addition, it sources the ICU-generated test vectors from
+# icuUcmTests.tcl.
+#
+# Note that sourcing the file will reinitialize any existing encoding test
+# vectors.
+#
+
+# Encoding profiles exercised by the test vectors
+set encProfiles [list tcl8 strict replace]
+set encDefaultProfile {tcl8}; # Keep in sync with the implementation's default profile
+
+# encValidStrings - Table of valid strings.
+#
+# Each row is <ENCODING STR BYTES CTRL COMMENT>
+# The pair <ENCODING,STR> should be unique for generated test ids to be unique.
+# STR is a string that can be encoded in the encoding ENCODING resulting
+# in the byte sequence BYTES. The CTRL field is a list that controls test
+# generation. It may contain zero or more of `solo`, `lead`, `tail` and
+# `middle` indicating that the generated tests should include the string
+# by itself, as the lead of a longer string, as the tail of a longer string
+# and in the middle of a longer string. If CTRL is empty, it is treated as
+# containing all four of the above. The CTRL field may also contain the
+# words knownBug or knownW3C which will cause the test generation for that
+# vector to be skipped.
+#
+# utf-16, utf-32 missing because they are automatically
+# generated based on le/be versions.
+set encValidStrings {}; # Reset the table
+
+lappend encValidStrings {*}{
+    ascii    \u0000 00 {} {Lowest ASCII}
+    ascii    \u007F 7F knownBug {Highest ASCII}
+    ascii    \u007D 7D {} {Closing brace - just to verify test scripts are escaped correctly}
+    ascii    \u007B 7B {} {Opening brace - just to verify test scripts are escaped correctly}
+
+    utf-8    \u0000 00 {} {Unicode Table 3.7 Row 1}
+    utf-8    \u007F 7F {} {Unicode Table 3.7 Row 1}
+    utf-8    \u0080 C280 {} {Unicode Table 3.7 Row 2}
+    utf-8    \u07FF DFBF {} {Unicode Table 3.7 Row 2}
+    utf-8    \u0800 E0A080 {} {Unicode Table 3.7 Row 3}
+    utf-8    \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3}
+    utf-8    \u1000 E18080 {} {Unicode Table 3.7 Row 4}
+    utf-8    \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4}
+    utf-8    \uD000 ED8080 {} {Unicode Table 3.7 Row 5}
+    utf-8    \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5}
+    utf-8    \uE000 EE8080 {} {Unicode Table 3.7 Row 6}
+    utf-8    \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6}
+    utf-8    \U10000 F0908080 {} {Unicode Table 3.7 Row 7}
+    utf-8    \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7}
+    utf-8    \U40000 F1808080 {} {Unicode Table 3.7 Row 8}
+    utf-8    \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8}
+    utf-8    \U100000 F4808080 {} {Unicode Table 3.7 Row 9}
+    utf-8    \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9}
+    utf-8    A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5}
+
+    utf-16le \u0000 0000 {} {Lowest code unit}
+    utf-16le \uD7FF FFD7 {} {Below high surrogate range}
+    utf-16le \uE000 00E0 {} {Above low surrogate range}
+    utf-16le \uFFFF FFFF {} {Highest code unit}
+    utf-16le \U010000 00D800DC {} {First surrogate pair}
+    utf-16le \U10FFFF FFDBFFDF {} {Last surrogate pair}
+    utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5}
+
+    utf-16be \u0000 0000 {} {Lowest code unit}
+    utf-16be \uD7FF D7FF {} {Below high surrogate range}
+    utf-16be \uE000 E000 {} {Above low surrogate range}
+    utf-16be \uFFFF FFFF {} {Highest code unit}
+    utf-16be \U010000 D800DC00 {} {First surrogate pair}
+    utf-16be \U10FFFF DBFFDFFF {} {Last surrogate pair}
+    utf-16be A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5}
+
+    utf-32le \u0000 00000000 {} {Lowest code unit}
+    utf-32le \uFFFF FFFF0000 {} {Highest BMP}
+    utf-32le \U010000 00000100 {} {First supplementary}
+    utf-32le \U10FFFF ffff1000 {} {Last supplementary}
+    utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5}
+
+    utf-32be \u0000 00000000 {} {Lowest code unit}
+    utf-32be \uFFFF 0000FFFF {} {Highest BMP}
+    utf-32be \U010000 00010000 {} {First supplementary}
+    utf-32be \U10FFFF 0010FFFF {} {Last supplementary}
+    utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5}
+}
+
+# encInvalidBytes - Table of invalid byte sequences
+# These are byte sequences that should not appear for an encoding. Each row is
+# of the form
+#    <ENCODING BYTES PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
+# The triple <ENCODING,BYTES,PROFILE> should be unique for test ids to be
+# unique. BYTES is a byte sequence that is invalid. EXPECTEDRESULT is the
+# expected string when the bytes are decoded using the PROFILE profile.
+# EXPECTEDFAILINDEX gives the expected index of the invalid byte under that
+# profile. The CTRL field is a list that controls test generation. It may
+# contain zero or more of `solo`, `lead`, `tail` and `middle` indicating that
+# the generated tests should use the bytes alone, or as the lead, tail or
+# middle of a longer sequence. If empty, it is treated as containing all four.
+# The CTRL field may also contain the words knownBug or knownW3C which will
+# cause the test generation for that vector to be skipped.
+#
+# utf-32 missing because they are automatically generated based on le/be
+# versions.
+set encInvalidBytes {}; # Reset the table
+
+# ascii - Any byte above 127 is invalid. The tcl8 profile maps such a byte to
+# the code point of the same numeric value, except for the range 80-9F which
+# is treated as cp1252 (those rows are tagged knownBug, so they are skipped).
+# This tests the TableToUtfProc code path.
+lappend encInvalidBytes {*}{
+    ascii 80 tcl8    \u20AC -1 {knownBug} {map to cp1252}
+    ascii 80 replace \uFFFD -1 {} {Smallest invalid byte}
+    ascii 80 strict  {}      0 {} {Smallest invalid byte}
+    ascii 81 tcl8    \u0081 -1 {knownBug} {map to cp1252}
+    ascii 82 tcl8    \u201A -1 {knownBug} {map to cp1252}
+    ascii 83 tcl8    \u0192 -1 {knownBug} {map to cp1252}
+    ascii 84 tcl8    \u201E -1 {knownBug} {map to cp1252}
+    ascii 85 tcl8    \u2026 -1 {knownBug} {map to cp1252}
+    ascii 86 tcl8    \u2020 -1 {knownBug} {map to cp1252}
+    ascii 87 tcl8    \u2021 -1 {knownBug} {map to cp1252}
+    ascii 88 tcl8    \u02C6 -1 {knownBug} {map to cp1252}
+    ascii 89 tcl8    \u2030 -1 {knownBug} {map to cp1252}
+    ascii 8A tcl8    \u0160 -1 {knownBug} {map to cp1252}
+    ascii 8B tcl8    \u2039 -1 {knownBug} {map to cp1252}
+    ascii 8C tcl8    \u0152 -1 {knownBug} {map to cp1252}
+    ascii 8D tcl8    \u008D -1 {knownBug} {map to cp1252}
+    ascii 8E tcl8    \u017D -1 {knownBug} {map to cp1252}
+    ascii 8F tcl8    \u008F -1 {knownBug} {map to cp1252}
+    ascii 90 tcl8    \u0090 -1 {knownBug} {map to cp1252}
+    ascii 91 tcl8    \u2018 -1 {knownBug} {map to cp1252}
+    ascii 92 tcl8    \u2019 -1 {knownBug} {map to cp1252}
+    ascii 93 tcl8    \u201C -1 {knownBug} {map to cp1252}
+    ascii 94 tcl8    \u201D -1 {knownBug} {map to cp1252}
+    ascii 95 tcl8    \u2022 -1 {knownBug} {map to cp1252}
+    ascii 96 tcl8    \u2013 -1 {knownBug} {map to cp1252}
+    ascii 97 tcl8    \u2014 -1 {knownBug} {map to cp1252}
+    ascii 98 tcl8    \u02DC -1 {knownBug} {map to cp1252}
+    ascii 99 tcl8    \u2122 -1 {knownBug} {map to cp1252}
+    ascii 9A tcl8    \u0161 -1 {knownBug} {map to cp1252}
+    ascii 9B tcl8    \u203A -1 {knownBug} {map to cp1252}
+    ascii 9C tcl8    \u0153 -1 {knownBug} {map to cp1252}
+    ascii 9D tcl8    \u009D -1 {knownBug} {map to cp1252}
+    ascii 9E tcl8    \u017E -1 {knownBug} {map to cp1252}
+    ascii 9F tcl8    \u0178 -1 {knownBug} {map to cp1252}
+
+    ascii FF tcl8    \u00FF -1 {} {Largest invalid byte}
+    ascii FF replace \uFFFD -1 {} {Largest invalid byte}
+    ascii FF strict  {}      0 {} {Largest invalid byte}
+}
+
+# utf-8 - valid sequences based on Table 3.7 in the Unicode
+# standard.
+#
+# Code Points        First   Second  Third   Fourth Byte
+# U+0000..U+007F     00..7F
+# U+0080..U+07FF     C2..DF  80..BF
+# U+0800..U+0FFF     E0      A0..BF  80..BF
+# U+1000..U+CFFF     E1..EC  80..BF  80..BF
+# U+D000..U+D7FF     ED      80..9F  80..BF
+# U+E000..U+FFFF     EE..EF  80..BF  80..BF
+# U+10000..U+3FFFF   F0      90..BF  80..BF  80..BF
+# U+40000..U+FFFFF   F1..F3  80..BF  80..BF  80..BF
+# U+100000..U+10FFFF F4      80..8F  80..BF  80..BF
+#
+# Tests below are based on the "gaps" in the above table. Note ascii test
+# values are repeated because internally a different code path is used
+# (UtfToUtfProc).
+# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080
+lappend encInvalidBytes {*}{
+    utf-8 80 tcl8    \u20AC -1 {} {map to cp1252}
+    utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte}
+    utf-8 80 strict  {}      0 {} {Smallest invalid byte}
+    utf-8 81 tcl8    \u0081 -1 {} {map to cp1252}
+    utf-8 82 tcl8    \u201A -1 {} {map to cp1252}
+    utf-8 83 tcl8    \u0192 -1 {} {map to cp1252}
+    utf-8 84 tcl8    \u201E -1 {} {map to cp1252}
+    utf-8 85 tcl8    \u2026 -1 {} {map to cp1252}
+    utf-8 86 tcl8    \u2020 -1 {} {map to cp1252}
+    utf-8 87 tcl8    \u2021 -1 {} {map to cp1252}
+    utf-8 88 tcl8    \u02C6 -1 {} {map to cp1252}
+    utf-8 89 tcl8    \u2030 -1 {} {map to cp1252}
+    utf-8 8A tcl8    \u0160 -1 {} {map to cp1252}
+    utf-8 8B tcl8    \u2039 -1 {} {map to cp1252}
+    utf-8 8C tcl8    \u0152 -1 {} {map to cp1252}
+    utf-8 8D tcl8    \u008D -1 {} {map to cp1252}
+    utf-8 8E tcl8    \u017D -1 {} {map to cp1252}
+    utf-8 8F tcl8    \u008F -1 {} {map to cp1252}
+    utf-8 90 tcl8    \u0090 -1 {} {map to cp1252}
+    utf-8 91 tcl8    \u2018 -1 {} {map to cp1252}
+    utf-8 92 tcl8    \u2019 -1 {} {map to cp1252}
+    utf-8 93 tcl8    \u201C -1 {} {map to cp1252}
+    utf-8 94 tcl8    \u201D -1 {} {map to cp1252}
+    utf-8 95 tcl8    \u2022 -1 {} {map to cp1252}
+    utf-8 96 tcl8    \u2013 -1 {} {map to cp1252}
+    utf-8 97 tcl8    \u2014 -1 {} {map to cp1252}
+    utf-8 98 tcl8    \u02DC -1 {} {map to cp1252}
+    utf-8 99 tcl8    \u2122 -1 {} {map to cp1252}
+    utf-8 9A tcl8    \u0161 -1 {} {map to cp1252}
+    utf-8 9B tcl8    \u203A -1 {} {map to cp1252}
+    utf-8 9C tcl8    \u0153 -1 {} {map to cp1252}
+    utf-8 9D tcl8    \u009D -1 {} {map to cp1252}
+    utf-8 9E tcl8    \u017E -1 {} {map to cp1252}
+    utf-8 9F tcl8    \u0178 -1 {} {map to cp1252}
+
+    utf-8 C0 tcl8    \u00C0 -1 {} {C0 is invalid anywhere}
+    utf-8 C0 strict  {}      0 {} {C0 is invalid anywhere}
+    utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere}
+    utf-8 C080 tcl8    \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
+    utf-8 C080 strict  {}      0 {} {C080 -> invalid}
+    utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}
+    utf-8 C0A2 tcl8    \u00C0\u00A2 -1 {} {websec.github.io - A}
+    utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}
+    utf-8 C0A2 strict  {}            0 {} {websec.github.io - A}
+    utf-8 C0A7 tcl8    \u00C0\u00A7 -1 {} {websec.github.io - double quote}
+    utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}
+    utf-8 C0A7 strict  {}            0 {} {websec.github.io - double quote}
+    utf-8 C0AE tcl8    \u00C0\u00AE -1 {} {websec.github.io - full stop}
+    utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}
+    utf-8 C0AE strict  {}            0 {} {websec.github.io - full stop}
+    utf-8 C0AF tcl8    \u00C0\u00AF -1 {} {websec.github.io - solidus}
+    utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}
+    utf-8 C0AF strict  {}            0 {} {websec.github.io - solidus}
+
+    utf-8 C1 tcl8    \u00C1 -1 {} {C1 is invalid everywhere}
+    utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}
+    utf-8 C1 strict  {}      0 {} {C1 is invalid everywhere}
+    utf-8 C181 tcl8    \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
+    utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}
+    utf-8 C181 strict  {}            0 {} {websec.github.io - base test (A)}
+    utf-8 C19C tcl8    \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
+    utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
+    utf-8 C19C strict  {}            0 {} {websec.github.io - reverse solidus}
+
+    utf-8 C2 tcl8      \u00C2     -1 {} {Missing trail byte}
+    utf-8 C2 replace   \uFFFD     -1 {} {Missing trail byte}
+    utf-8 C2 strict    {}          0 {} {Missing trail byte}
+    utf-8 C27F tcl8    \u00C2\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 C27F strict  {}          0 {} {Trail byte must be 80:BF}
+    utf-8 DF tcl8      \u00DF     -1 {} {Missing trail byte}
+    utf-8 DF replace   \uFFFD     -1 {} {Missing trail byte}
+    utf-8 DF strict    {}          0 {} {Missing trail byte}
+    utf-8 DF7F tcl8    \u00DF\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 DF7F strict  {}          0 {} {Trail byte must be 80:BF}
+    utf-8 DFE0A080 tcl8    \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
+    utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence}
+    utf-8 DFE0A080 strict  {}            0 {} {Invalid trail byte is start of valid sequence}
+
+    utf-8 E0 tcl8      \u00E0     -1 {} {Missing trail byte}
+    utf-8 E0 replace   \uFFFD     -1 {} {Missing trail byte}
+    utf-8 E0 strict    {}          0 {} {Missing trail byte}
+    utf-8 E080 tcl8      \u00E0\u20AC   -1 {} {First trail byte must be A0:BF}
+    utf-8 E080 replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
+    utf-8 E080 strict    {}              0 {} {First trail byte must be A0:BF}
+    utf-8 E0819C tcl8    \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
+    utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
+    utf-8 E0819C strict  {}                  0 {} {websec.github.io - reverse solidus}
+    utf-8 E09F tcl8      \u00E0\u0178   -1 {} {First trail byte must be A0:BF}
+    utf-8 E09F replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
+    utf-8 E09F strict    {}              0 {} {First trail byte must be A0:BF}
+    utf-8 E0A0 tcl8      \u00E0\u00A0   -1 {} {Missing second trail byte}
+    utf-8 E0A0 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 E0A0 strict    {}              0 {} {Missing second trail byte}
+    utf-8 E0BF tcl8      \u00E0\u00BF   -1 {} {Missing second trail byte}
+    utf-8 E0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 E0BF strict    {}              0 {} {Missing second trail byte}
+    utf-8 E0A07F tcl8    \u00E0\u00A0\x7F   -1 {}     {Second trail byte must be 80:BF}
+    utf-8 E0A07F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 E0A07F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+    utf-8 E0BF7F tcl8    \u00E0\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
+    utf-8 E0BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 E0BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+
+    utf-8 E1 tcl8      \u00E1     -1 {} {Missing trail byte}
+    utf-8 E1 replace   \uFFFD     -1 {} {Missing trail byte}
+    utf-8 E1 strict    {}          0 {} {Missing trail byte}
+    utf-8 E17F tcl8    \u00E1\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 E17F strict  {}          0 {} {Trail byte must be 80:BF}
+    utf-8 E181 tcl8      \u00E1\u0081   -1 {} {Missing second trail byte}
+    utf-8 E181 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 E181 strict    {}              0 {} {Missing second trail byte}
+    utf-8 E1BF tcl8      \u00E1\u00BF   -1 {} {Missing second trail byte}
+    utf-8 E1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 E1BF strict    {}              0 {} {Missing second trail byte}
+    utf-8 E1807F tcl8    \u00E1\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
+    utf-8 E1807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 E1807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+    utf-8 E1BF7F tcl8    \u00E1\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
+    utf-8 E1BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 E1BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+    utf-8 EC tcl8      \u00EC     -1 {} {Missing trail byte}
+    utf-8 EC replace   \uFFFD     -1 {} {Missing trail byte}
+    utf-8 EC strict    {}          0 {} {Missing trail byte}
+    utf-8 EC7F tcl8    \u00EC\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
+    utf-8 EC7F strict  {}          0 {} {Trail byte must be 80:BF}
+    utf-8 EC81 tcl8      \u00EC\u0081   -1 {} {Missing second trail byte}
+    utf-8 EC81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EC81 strict    {}              0 {} {Missing second trail byte}
+    utf-8 ECBF tcl8      \u00EC\u00BF   -1 {} {Missing second trail byte}
+    utf-8 ECBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 ECBF strict    {}              0 {} {Missing second trail byte}
+    utf-8 EC807F tcl8    \u00EC\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
+    utf-8 EC807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 EC807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+    utf-8 ECBF7F tcl8    \u00EC\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
+    utf-8 ECBF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 ECBF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
+
+    utf-8 ED tcl8       \u00ED        -1 {} {Missing trail byte}
+    utf-8 ED replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 ED strict     {}             0 {} {Missing trail byte}
+    utf-8 ED7F tcl8     \u00ED\u7F    -1 {} {First trail byte must be 80:9F}
+    utf-8 ED7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:9F}
+    utf-8 ED7F strict   {}             0 {} {First trail byte must be 80:9F}
+    utf-8 EDA0 tcl8     \u00ED\u00A0  -1 {} {First trail byte must be 80:9F}
+    utf-8 EDA0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:9F}
+    utf-8 EDA0 strict   {}             0 {} {First trail byte must be 80:9F}
+    utf-8 ED81 tcl8      \u00ED\u0081   -1 {} {Missing second trail byte}
+    utf-8 ED81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 ED81 strict    {}              0 {} {Missing second trail byte}
+    utf-8 EDBF tcl8      \u00ED\u00BF   -1 {} {Missing second trail byte}
+    utf-8 EDBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EDBF strict    {}              0 {} {Missing second trail byte}
+    utf-8 ED807F tcl8      \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 ED807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 ED807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 ED9F7F tcl8      \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 ED9F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 ED9F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 EDA080 tcl8       \uD800          -1 {}  {High surrogate}
+    utf-8 EDA080 replace    \uFFFD          -1 {}  {High surrogate}
+    utf-8 EDA080 strict     {}               0 {}  {High surrogate}
+    utf-8 EDAFBF tcl8       \uDBFF          -1 {}  {High surrogate}
+    utf-8 EDAFBF replace    \uFFFD          -1 {}  {High surrogate}
+    utf-8 EDAFBF strict     {}               0 {}  {High surrogate}
+    utf-8 EDB080 tcl8       \uDC00          -1 {}  {Low surrogate}
+    utf-8 EDB080 replace    \uFFFD          -1 {}  {Low surrogate}
+    utf-8 EDB080 strict     {}               0 {}  {Low surrogate}
+    utf-8 EDBFBF tcl8       \uDFFF          -1 {}  {Low surrogate}
+    utf-8 EDBFBF replace    \uFFFD          -1 {}  {Low surrogate}
+    utf-8 EDBFBF strict     {}               0 {}  {Low surrogate}
+    utf-8 EDA080EDB080 tcl8 \U00010000      -1 {}  {High low surrogate pair}
+    utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {}  {High low surrogate pair}
+    utf-8 EDA080EDB080 strict {}             0 {}  {High low surrogate pair}
+    utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF      -1 {}  {High low surrogate pair}
+    utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {}  {High low surrogate pair}
+    utf-8 EDAFBFEDBFBF strict {}             0 {}  {High low surrogate pair}
+
+    utf-8 EE tcl8       \u00EE        -1 {} {Missing trail byte}
+    utf-8 EE replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 EE strict     {}             0 {} {Missing trail byte}
+    utf-8 EE7F tcl8     \u00EE\u7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 EE7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 EE7F strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 EED0 tcl8     \u00EE\u00D0  -1 {} {First trail byte must be 80:BF}
+    utf-8 EED0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
+    utf-8 EED0 strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 EE81 tcl8      \u00EE\u0081   -1 {} {Missing second trail byte}
+    utf-8 EE81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EE81 strict    {}              0 {} {Missing second trail byte}
+    utf-8 EEBF tcl8      \u00EE\u00BF   -1 {} {Missing second trail byte}
+    utf-8 EEBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EEBF strict    {}              0 {} {Missing second trail byte}
+    utf-8 EE807F tcl8      \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 EE807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 EE807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 EEBF7F tcl8      \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 EEBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 EEBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 EF tcl8       \u00EF        -1 {} {Missing trail byte}
+    utf-8 EF replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 EF strict     {}             0 {} {Missing trail byte}
+    utf-8 EF7F tcl8     \u00EF\u7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 EF7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 EF7F strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 EFD0 tcl8     \u00EF\u00D0  -1 {} {First trail byte must be 80:BF}
+    utf-8 EFD0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
+    utf-8 EFD0 strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 EF81 tcl8      \u00EF\u0081   -1 {} {Missing second trail byte}
+    utf-8 EF81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EF81 strict    {}              0 {} {Missing second trail byte}
+    utf-8 EFBF tcl8      \u00EF\u00BF   -1 {} {Missing second trail byte}
+    utf-8 EFBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 EFBF strict    {}              0 {} {Missing second trail byte}
+    utf-8 EF807F tcl8      \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 EF807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 EF807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 EFBF7F tcl8      \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 EFBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 EFBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+
+    utf-8 F0 tcl8       \u00F0        -1 {} {Missing trail byte}
+    utf-8 F0 replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 F0 strict     {}             0 {} {Missing trail byte}
+    utf-8 F080 tcl8     \u00F0\u20AC  -1 {} {First trail byte must be 90:BF}
+    utf-8 F080 replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
+    utf-8 F080 strict   {}             0 {} {First trail byte must be 90:BF}
+    utf-8 F08F tcl8     \u00F0\u8F    -1 {} {First trail byte must be 90:BF}
+    utf-8 F08F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
+    utf-8 F08F strict   {}             0 {} {First trail byte must be 90:BF}
+    utf-8 F0D0 tcl8     \u00F0\u00D0  -1 {} {First trail byte must be 90:BF}
+    utf-8 F0D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 90:BF}
+    utf-8 F0D0 strict   {}             0 {} {First trail byte must be 90:BF}
+    utf-8 F090 tcl8      \u00F0\u0090   -1 {} {Missing second trail byte}
+    utf-8 F090 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F090 strict    {}              0 {} {Missing second trail byte}
+    utf-8 F0BF tcl8      \u00F0\u00BF   -1 {} {Missing second trail byte}
+    utf-8 F0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F0BF strict    {}              0 {} {Missing second trail byte}
+    utf-8 F0907F tcl8      \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F0907F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F0907F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F0BF7F tcl8      \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F0BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F0BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F090BF tcl8      \u00F0\u0090\u00BF   -1 {} {Missing third trail byte}
+    utf-8 F090BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F090BF strict    {}              0 {} {Missing third trail byte}
+    utf-8 F0BF81 tcl8      \u00F0\u00BF\u0081   -1 {} {Missing third trail byte}
+    utf-8 F0BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F0BF81 strict    {}              0 {} {Missing third trail byte}
+    utf-8 F0BF807F tcl8      \u00F0\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F0BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F0BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
+    utf-8 F090BFD0 tcl8      \u00F0\u0090\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F090BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F090BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
+
+    utf-8 F1 tcl8       \u00F1        -1 {} {Missing trail byte}
+    utf-8 F1 replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 F1 strict     {}             0 {} {Missing trail byte}
+    utf-8 F17F tcl8     \u00F1\u7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 F17F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
+    utf-8 F17F strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 F1D0 tcl8     \u00F1\u00D0  -1 {} {First trail byte must be 80:BF}
+    utf-8 F1D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
+    utf-8 F1D0 strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 F180 tcl8      \u00F1\u20AC   -1 {} {Missing second trail byte}
+    utf-8 F180 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F180 strict    {}              0 {} {Missing second trail byte}
+    utf-8 F1BF tcl8      \u00F1\u00BF   -1 {} {Missing second trail byte}
+    utf-8 F1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F1BF strict    {}              0 {} {Missing second trail byte}
+    utf-8 F1807F tcl8      \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F1807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F1807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F1BF7F tcl8      \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F1BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F1BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F180BF tcl8      \u00F1\u20AC\u00BF   -1 {} {Missing third trail byte}
+    utf-8 F180BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F180BF strict    {}              0 {} {Missing third trail byte}
+    utf-8 F1BF81 tcl8      \u00F1\u00BF\u0081   -1 {} {Missing third trail byte}
+    utf-8 F1BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F1BF81 strict    {}              0 {} {Missing third trail byte}
+    utf-8 F1BF807F tcl8      \u00F1\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F1BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F1BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
+    utf-8 F180BFD0 tcl8      \u00F1\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F180BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F180BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
+    utf-8 F3 tcl8       \u00F3        -1 {} {Missing trail byte}
+    utf-8 F3 replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 F3 strict     {}             0 {} {Missing trail byte}
+    utf-8 F37F tcl8     \u00F3\x7F    -1 {} {First trail byte must be 80:BF}
+    utf-8 F37F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
+    utf-8 F37F strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 F3D0 tcl8     \u00F3\u00D0  -1 {} {First trail byte must be 80:BF}
+    utf-8 F3D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
+    utf-8 F3D0 strict   {}             0 {} {First trail byte must be 80:BF}
+    utf-8 F380 tcl8      \u00F3\u20AC   -1 {} {Missing second trail byte}
+    utf-8 F380 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F380 strict    {}              0 {} {Missing second trail byte}
+    utf-8 F3BF tcl8      \u00F3\u00BF   -1 {} {Missing second trail byte}
+    utf-8 F3BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F3BF strict    {}              0 {} {Missing second trail byte}
+    utf-8 F3807F tcl8      \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F3807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F3807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F3BF7F tcl8      \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F3BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F3BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F380BF tcl8      \u00F3\u20AC\u00BF   -1 {} {Missing third trail byte}
+    utf-8 F380BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F380BF strict    {}              0 {} {Missing third trail byte}
+    utf-8 F3BF81 tcl8      \u00F3\u00BF\u0081   -1 {} {Missing third trail byte}
+    utf-8 F3BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F3BF81 strict    {}              0 {} {Missing third trail byte}
+    utf-8 F3BF807F tcl8      \u00F3\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F3BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F3BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
+    utf-8 F380BFD0 tcl8      \u00F3\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F380BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F380BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
+
+    utf-8 F4 tcl8       \u00F4        -1 {} {Missing trail byte}
+    utf-8 F4 replace    \uFFFD        -1 {} {Missing trail byte}
+    utf-8 F4 strict     {}             0 {} {Missing trail byte}
+    utf-8 F47F tcl8     \u00F4\u7F    -1 {} {First trail byte must be 80:8F}
+    utf-8 F47F replace  \uFFFD\u7F    -1 {knownW3C} {First trail byte must be 80:8F}
+    utf-8 F47F strict   {}             0 {} {First trail byte must be 80:8F}
+    utf-8 F490 tcl8     \u00F4\u0090  -1 {} {First trail byte must be 80:8F}
+    utf-8 F490 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:8F}
+    utf-8 F490 strict   {}             0 {} {First trail byte must be 80:8F}
+    utf-8 F480 tcl8      \u00F4\u20AC   -1 {} {Missing second trail byte}
+    utf-8 F480 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F480 strict    {}              0 {} {Missing second trail byte}
+    utf-8 F48F tcl8      \u00F4\u008F   -1 {} {Missing second trail byte}
+    utf-8 F48F replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
+    utf-8 F48F strict    {}              0 {} {Missing second trail byte}
+    utf-8 F4807F tcl8      \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F4807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F4807F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F48F7F tcl8      \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF}
+    utf-8 F48F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
+    utf-8 F48F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
+    utf-8 F48081 tcl8      \u00F4\u20AC\u0081   -1 {} {Missing third trail byte}
+    utf-8 F48081 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F48081 strict    {}              0 {} {Missing third trail byte}
+    utf-8 F48F81 tcl8      \u00F4\u008F\u0081   -1 {} {Missing third trail byte}
+    utf-8 F48F81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
+    utf-8 F48F81 strict    {}              0 {} {Missing third trail byte}
+    utf-8 F481817F tcl8      \u00F4\u0081\u0081\x7F   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F480817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F480817F strict    {}              0 {} {Third trail byte must be 80:BF}
+    utf-8 F48FBFD0 tcl8      \u00F4\u008F\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
+    utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
+    utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
+
+    utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
+    utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}
+    utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
+    utf-8 FF tcl8    \u00FF -1 {} {F5:FF are invalid everywhere}
+    utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere}
+    utf-8 FF strict  {}      0 {} {F5:FF are invalid everywhere}
+
+    utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8}
+    utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9}
+    utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10}
+    utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}
+}
+
+# utf-16le and utf-16be test cases. The utf-16 cases are not listed here;
+# they are automatically generated from these rows depending on platform
+# endianness. A truncated sequence can only occur at the end of the input
+# (including when it is the entire input), hence the {solo tail} CTRL value
+# on the "Truncated" rows.
+# In the rows below, the tcl8 profile passes a lone surrogate through as
+# itself, the replace profile expects \uFFFD and the strict profile expects
+# an empty result with a failure index of 0.
+lappend encInvalidBytes {*}{
+    utf-16le 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
+    utf-16le 41      replace   \uFFFD -1 {solo tail} {Truncated}
+    utf-16le 41      strict    {}      0 {solo tail} {Truncated}
+    utf-16le 00D8    tcl8      \uD800 -1 {} {Missing low surrogate}
+    utf-16le 00D8    replace   \uFFFD -1 {knownBug} {Missing low surrogate}
+    utf-16le 00D8    strict    {}      0 {knownBug} {Missing low surrogate}
+    utf-16le 00DC    tcl8      \uDC00 -1 {} {Missing high surrogate}
+    utf-16le 00DC    replace   \uFFFD -1 {knownBug} {Missing high surrogate}
+    utf-16le 00DC    strict    {}      0 {knownBug} {Missing high surrogate}
+
+    utf-16be 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
+    utf-16be 41      replace   \uFFFD -1 {solo tail} {Truncated}
+    utf-16be 41      strict    {}      0 {solo tail} {Truncated}
+    utf-16be D800    tcl8      \uD800 -1 {} {Missing low surrogate}
+    utf-16be D800    replace   \uFFFD -1 {knownBug} {Missing low surrogate}
+    utf-16be D800    strict    {}      0 {knownBug} {Missing low surrogate}
+    utf-16be DC00    tcl8      \uDC00 -1 {} {Missing high surrogate}
+    utf-16be DC00    replace   \uFFFD -1 {knownBug} {Missing high surrogate}
+    utf-16be DC00    strict    {}      0 {knownBug} {Missing high surrogate}
+}
+
+# utf-32le and utf-32be test cases. The utf-32 cases are not listed here;
+# they are automatically generated from these rows depending on platform
+# endianness. A truncated sequence can only occur at the end of the input
+# (including when it is the entire input), hence CTRL is limited to solo
+# and/or tail on the "Truncated" rows.
+# NOTE(review): the "replace" Truncated rows use {solo} except for
+# "utf-32be 41", which uses {solo tail} - confirm whether this is intended.
+lappend encInvalidBytes {*}{
+    utf-32le 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32le 41      replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 41      strict    {}   0 {solo tail} {Truncated}
+    utf-32le 4100    tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32le 4100    replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 4100    strict    {}   0 {solo tail} {Truncated}
+    utf-32le 410000  tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32le 410000  replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32le 410000  strict    {}       0 {solo tail} {Truncated}
+    utf-32le 00D80000 tcl8     \uD800   -1 {} {High-surrogate}
+    utf-32le 00D80000 replace  \uFFFD   -1 {} {High-surrogate}
+    utf-32le 00D80000 strict   {}        0 {} {High-surrogate}
+    utf-32le 00DC0000 tcl8     \uDC00   -1 {} {Low-surrogate}
+    utf-32le 00DC0000 replace  \uFFFD   -1 {} {Low-surrogate}
+    utf-32le 00DC0000 strict   {}        0 {} {Low-surrogate}
+    utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
+    utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
+    utf-32le 00D8000000DC0000 strict  {}            0 {} {High-low-surrogate-pair}
+    utf-32le 00001100 tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32le 00001100 replace \UFFFD -1 {} {Out of range}
+    utf-32le 00001100 strict {}       0 {} {Out of range}
+    utf-32le FFFFFFFF tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range}
+    utf-32le FFFFFFFF strict {}       0 {} {Out of range}
+
+    utf-32be 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32be 41      replace   \uFFFD  -1 {solo tail} {Truncated}
+    utf-32be 41      strict    {}       0 {solo tail} {Truncated}
+    utf-32be 0041    tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32be 0041    replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32be 0041    strict    {}   0 {solo tail} {Truncated}
+    utf-32be 000041  tcl8      \uFFFD  -1 {solo tail} {Truncated}
+    utf-32be 000041  replace   \uFFFD  -1 {solo} {Truncated}
+    utf-32be 000041  strict    {}       0 {solo tail} {Truncated}
+    utf-32be 0000D800 tcl8     \uD800   -1 {} {High-surrogate}
+    utf-32be 0000D800 replace  \uFFFD   -1 {} {High-surrogate}
+    utf-32be 0000D800 strict   {}        0 {} {High-surrogate}
+    utf-32be 0000DC00 tcl8     \uDC00   -1 {} {Low-surrogate}
+    utf-32be 0000DC00 replace  \uFFFD   -1 {} {Low-surrogate}
+    utf-32be 0000DC00 strict   {}        0 {} {Low-surrogate}
+    utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
+    utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
+    utf-32be 0000D8000000DC00 strict  {}            0 {} {High-low-surrogate-pair}
+    utf-32be 00110000 tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32be 00110000 replace \UFFFD -1 {} {Out of range}
+    utf-32be 00110000 strict {}       0 {} {Out of range}
+    utf-32be FFFFFFFF tcl8 \UFFFD    -1 {} {Out of range}
+    utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range}
+    utf-32be FFFFFFFF strict {}       0 {} {Out of range}
+}
+
+# Strings that cannot be encoded for specific encoding / profiles
+# <ENCODING STRING PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
+# <ENCODING,STRING,PROFILE> should be unique for test ids to be unique.
+# See earlier comments about CTRL field.
+#
+# Note utf-16 and utf-32 are missing because they are automatically
+# generated based on the le/be versions.
+# The COMMENT column names the kind of code point in STRING: \uD800 is a
+# high surrogate and \uDC00 is a low surrogate.
+# TODO - out of range code point (note cannot be generated by \U notation)
+lappend encUnencodableStrings {*}{
+    ascii \u00e0 tcl8    3f -1 {} {unencodable}
+    ascii \u00e0 strict  {}  0 {} {unencodable}
+
+    iso8859-1 \u0141 tcl8    3f -1 {} unencodable
+    iso8859-1 \u0141 strict  {}  0 {} unencodable
+
+    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
+    utf-8 \uD800 strict  {}      0 {} High-surrogate
+    utf-8 \uDC00 tcl8    edb080 -1 {} Low-surrogate
+    utf-8 \uDC00 strict  {}      0 {} Low-surrogate
+}
+
+
+# icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script
+# and provides test vectors for the above tables for various encodings
+# based on ICU UCM files.
+# TODO - sourcing it is commented out for now because it generates a lot of mismatches.
+# source [file join [file dirname [info script]] icuUcmTests.tcl]
author	apnadkarni <apnmbx-wits@yahoo.com>	2023-02-24 09:35:09 (GMT)
committer	apnadkarni <apnmbx-wits@yahoo.com>	2023-02-24 09:35:09 (GMT)
commit	854369a67c1719356d036c3fe11e052a7fe62e80 (patch)
tree	2c2c51b218930fb40ec957e8f513e13315ceda6e /tests/encodingVectors.tcl
parent	485bc2fd887abb2501321c670e66c849da1b026c (diff)
download	tcl-854369a67c1719356d036c3fe11e052a7fe62e80.zip tcl-854369a67c1719356d036c3fe11e052a7fe62e80.tar.gz tcl-854369a67c1719356d036c3fe11e052a7fe62e80.tar.bz2