diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-16 17:15:35 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-16 17:15:35 (GMT) |
commit | bd084c2fc97ffe2e19f0f44e23f441b89c139e9b (patch) | |
tree | 8f83258bdfebc1fc14c28f4468a5c2f50ee9144a | |
parent | d9046229bc814b561eb59c03e0aa3627264c07ea (diff) | |
download | tcl-bd084c2fc97ffe2e19f0f44e23f441b89c139e9b.zip tcl-bd084c2fc97ffe2e19f0f44e23f441b89c139e9b.tar.gz tcl-bd084c2fc97ffe2e19f0f44e23f441b89c139e9b.tar.bz2 |
Bit more work on encoding test framework. Long way to go.
-rw-r--r-- | generic/tclEncoding.c | 65 | ||||
-rw-r--r-- | tests/cmdAH.test | 427 |
2 files changed, 324 insertions, 168 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 8cd970f..470f8f3 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2368,6 +2368,7 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int profile; result = TCL_OK; @@ -2385,8 +2386,8 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + profile = TCL_ENCODING_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - int profile = TCL_ENCODING_PROFILE_GET(flags); if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* @@ -2415,15 +2416,15 @@ UtfToUtfProc( (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || PROFILE_REPLACE(profile))) { /* Special sequence \xC0\x80 */ - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_SYNTAX; - break; - } - - if (PROFILE_REPLACE(profile)) { - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - src += 1; /* C0, 80 handled in next loop iteration - since dst limit has to be checked */ + if (flags & ENCODING_INPUT) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } } else { /* * Convert 0xC080 to real nulls when we are in output mode, @@ -2432,6 +2433,7 @@ UtfToUtfProc( *dst++ = 0; src += 2; } + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* @@ -2516,32 +2518,37 @@ UtfToUtfProc( /* * A surrogate character is detected, handle especially. */ - /* TODO - what about REPLACE profile? */ if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } - - low = ch; - len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - - if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { - - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; + if (0 && PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + src += len; + // dst += Tcl_UniCharToUtf(ch, dst); + } + else { + low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } +cesu8: + *dst++ = (char)(((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char)(((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char)((ch | 0x80) & 0xBF); + continue; } - cesu8: - *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); - continue; + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; } else if (PROFILE_STRICT(profile) && (!(flags & ENCODING_INPUT)) && SURROGATE(ch)) { diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 6aa3c2e..6386658 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -187,19 +187,18 @@ set encProfiles {tcl8 strict replace} # generated based on le/be versions. Also add all ranges from Unicode standard # Table 3.7 set encValidStrings { - ascii ABC \x41\x42\x43 - utf-8 A\u0000\u03A9\u8A9E\U00010384 \x41\x00\xCE\xA9\xE8\xAA\x9E\xF0\x90\x8E\x84 - utf-16le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\xA9\x03\x9E\x8A\x00\xD8\x84\xDF - utf-16be A\u0000\u03A9\u8A9E\U00010384 \x00\x41\x00\x00\x03\xA9\x8A\x9E\xD8\x00\xDF\x84 - utf-32le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\x00\x00\x00\x00\xA9\x03\x00\x00\x9E\x8A\x00\x00\x84\x03\x01\x00 - utf-32be A\u0000\u03A9\u8A9E\U00010384 \x00\x00\x00\x41\x00\x00\x00\x00\x00\x00\x03\xA9\x00\x00\x8A\x9E\x00\x01\x03\x84 + ascii ABC 414243 + utf-8 A\u0000\u03A9\u8A9E\U00010384 4100CEA9E8AA9EF0908E84 + utf-16le A\u0000\u03A9\u8A9E\U00010384 41000000A9039E8A00D884DF + utf-16be A\u0000\u03A9\u8A9E\U00010384 0041000003A98A9ED800DF84 + utf-32le A\u0000\u03A9\u8A9E\U00010384 4100000000000000A90300009E8A000084030100 + utf-32be A\u0000\u03A9\u8A9E\U00010384 0000004100000000000003A900008A9E00010384 } # Invalid byte sequences. These are driven from a table with format # {encoding bytes profile expectedresult expectedfailindex ctrl comment} # -# Note tag is used in test id generation as well. The combination -# <enc,profile,tag> should be unique for test ids to be unique. Note utf-16, +# <enc,bytes,profile> should be unique for test ids to be unique. Note utf-16, # utf-32 missing because they are automatically generated based on le/be # versions. Each entry potentially results in generation of multiple tests. # This is controlled by the ctrl field. This should be a list of @@ -214,13 +213,15 @@ set encValidStrings { # TODO - other encodings and test cases -# ascii - Any byte above 127 is invalid -set encInvalidBytes { - ascii 80 default \u20AC -1 {} {map to cp1252} - ascii 80 tcl8 \u20AC -1 {} {map to cp1252} +# ascii - Any byte above 127 is invalid and is mapped +# to the same numeric code point except for the range +# 80-9F which is treated as cp1252. +# This tests the TableToUtfProc code path. +lappend encInvalidBytes {*}{ + ascii 80 default \u20AC -1 {knownBug} {map to cp1252} + ascii 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} ascii 80 strict {} 0 {} {Smallest invalid byte} - ascii 81 default \u0081 -1 {knownBug} {map to cp1252} ascii 82 default \u201A -1 {knownBug} {map to cp1252} ascii 83 default \u0192 -1 {knownBug} {map to cp1252} @@ -259,25 +260,80 @@ set encInvalidBytes { ascii FF strict {} 0 {} {Largest invalid byte} } -# Following invalid sequences based on Table 3.7 in the Unicode standard. -# utf-8 C0, C1, F5:FF are invalid bytes ANYWHERE. -# Exception is C080 in non-strict mode. -# +# utf-8 - valid sequences based on Table 3.7 in the Unicode +# standard. +# +# Code Points First Second Third Fourth Byte +# U+0000..U+007F 00..7F +# U+0080..U+07FF C2..DF 80..BF +# U+0800..U+0FFF E0 A0..BF 80..BF +# U+1000..U+CFFF E1..EC 80..BF 80..BF +# U+D000..U+D7FF ED 80..9F 80..BF +# U+E000..U+FFFF EE..EF 80..BF 80..BF +# U+10000..U+3FFFF F0 90..BF 80..BF 80..BF +# U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF +# U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +# +# Tests below are based on the "gaps" in the above table. Note ascii test +# values are repeated because internally a different code path is used +# (UtfToUtfProc). +# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080 lappend encInvalidBytes {*}{ + utf-8 80 default \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} + utf-8 80 strict {} 0 {} {Smallest invalid byte} + utf-8 81 default \u0081 -1 {knownBug} {map to cp1252} + utf-8 82 default \u201A -1 {knownBug} {map to cp1252} + utf-8 83 default \u0192 -1 {knownBug} {map to cp1252} + utf-8 84 default \u201E -1 {knownBug} {map to cp1252} + utf-8 85 default \u2026 -1 {knownBug} {map to cp1252} + utf-8 86 default \u2020 -1 {knownBug} {map to cp1252} + utf-8 87 default \u2021 -1 {knownBug} {map to cp1252} + utf-8 88 default \u0276 -1 {knownBug} {map to cp1252} + utf-8 89 default \u2030 -1 {knownBug} {map to cp1252} + utf-8 8A default \u0160 -1 {knownBug} {map to cp1252} + utf-8 8B default \u2039 -1 {knownBug} {map to cp1252} + utf-8 8C default \u0152 -1 {knownBug} {map to cp1252} + utf-8 8D default \u008D -1 {knownBug} {map to cp1252} + utf-8 8E default \u017D -1 {knownBug} {map to cp1252} + utf-8 8F default \u008F -1 {knownBug} {map to cp1252} + utf-8 90 default \u0090 -1 {knownBug} {map to cp1252} + utf-8 91 default \u2018 -1 {knownBug} {map to cp1252} + utf-8 92 default \u2019 -1 {knownBug} {map to cp1252} + utf-8 93 default \u201C -1 {knownBug} {map to cp1252} + utf-8 94 default \u201D -1 {knownBug} {map to cp1252} + utf-8 95 default \u2022 -1 {knownBug} {map to cp1252} + utf-8 96 default \u2013 -1 {knownBug} {map to cp1252} + utf-8 97 default \u2014 -1 {knownBug} {map to cp1252} + utf-8 98 default \u02DC -1 {knownBug} {map to cp1252} + utf-8 99 default \u2122 -1 {knownBug} {map to cp1252} + utf-8 9A default \u0161 -1 {knownBug} {map to cp1252} + utf-8 9B default \u203A -1 {knownBug} {map to cp1252} + utf-8 9C default \u0153 -1 {knownBug} {map to cp1252} + utf-8 9D default \u009D -1 {knownBug} {map to cp1252} + utf-8 9E default \u017E -1 {knownBug} {map to cp1252} + utf-8 9F default \u0178 -1 {knownBug} {map to cp1252} + utf-8 C0 default \u00C0 -1 {} {C0 is invalid anywhere} utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} - utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} - + utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} utf-8 C080 default \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} - utf-8 C080 replace \uFFFD\uFFFD -1 C080 {} {C080 -> U+0 in Tcl's internal modified UTF8} - utf-8 C080 strict {} 0 {} {C080 -> U+0 in Tcl's internal modified UTF8} - + utf-8 C080 strict {} 0 {} {C080 -> invalid} + utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} utf-8 C1 default \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + + utf-8 C1 default \u00C1 -1 {} {Require valid trail byte} + utf-8 C1 tcl8 \u00C1 -1 {} {Require valid trail byte} + utf-8 C1 replace \uFFFD -1 {} {Require valid trail byte} + utf-8 C1 strict {} 0 {} {Require valid trail byte} + + utf-8 F5 default \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} @@ -286,14 +342,14 @@ lappend encInvalidBytes {*}{ utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} - utf-8 F5908080 default \u00F5 -1 {knownBug} {F5:FF with trailing bytes} + + utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8} + utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3-9} + utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10} + utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3.11} } set xxencInvalidBytes { - ascii \x41\xe9\x42 default A\u00E9B -1 {non-ASCII} - ascii \x41\xe9\x42 tcl8 A\u00E9B -1 {non-ASCII} - ascii \x41\xe9\x42 replace A\uFFFDB -1 {non-ASCII} - ascii \x41\xe9\x42 strict A 1 {non-ASCII} utf-8 \x41\x80\x42 default A\u0080B -1 80 utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 @@ -343,31 +399,39 @@ set utf32-le-TODO { } # Strings that cannot be encoded for specific encoding / profiles -# {encoding string profile bytes failindex tag} -# Note tag is used in test id generation as well. The combination -# <enc,profile,tag> should be unique for test ids to be unique. +# {encoding string profile exptedresult expectedfailindex ctrl comment} +# <enc,string,profile> should be unique for test ids to be unique. # Note utf-16, utf-32 missing because they are automatically # generated based on le/be versions. +# Each entry potentially results in generation of multiple tests. +# This is controlled by the ctrl field. This should be a list of +# zero or more of the following: +# solo - the test data is the string itself +# lead - the test data is the string followed by a valid suffix +# tail - the test data is the string preceded by a prefix +# middle - the test data is the string wrapped by a prefix and suffix +# If the ctrl field is empty it is treated as all of the above +# Note if there is any other value by itself, it will cause the test to +# be skipped. This is intentional to skip known bugs. # TODO - other encodings and test cases # TODO - out of range code point (note cannot be generated by \U notation) set encUnencodableStrings { - ascii A\u00e0B default \x41\x3f\x42 -1 non-ASCII - ascii A\u00e0B tcl8 \x41\x3f\x42 -1 non-ASCII - ascii A\u00e0B strict \x41 1 non-ASCII - - iso8859-1 A\u0141B default \x41\x3f\x42 -1 unencodable - iso8859-1 A\u0141B tcl8 \x41\x3f\x42 -1 unencodable - iso8859-1 A\u0141B strict \x41 0 unencodable - - utf-8 A\uD800B default \x41\xed\xa0\x80\x42 -1 High-surrogate - utf-8 A\uD800B tcl8 \x41\xed\xa0\x80\x42 -1 High-surrogate - utf-8 A\uD800B strict \x41 1 High-surrogate - utf-8 A\uDC00B default \x41\xed\xb0\x80\x42 -1 High-surrogate - utf-8 A\uDC00B tcl8 \x41\xed\xb0\x80\x42 -1 High-surrogate - utf-8 A\uDC00B strict \x41 1 High-surrogate + ascii \u00e0 default 3f -1 {} {unencodable} + ascii \u00e0 tcl8 3f -1 {} {unencodable} + ascii \u00e0 strict {} 0 {} {unencodable} + + iso8859-1 \u0141 default 3f -1 {} unencodable + iso8859-1 \u0141 tcl8 3f -1 {} unencodable + iso8859-1 \u0141 strict {} 0 {} unencodable + + utf-8 \uD800 default eda080 -1 {} High-surrogate + utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate + utf-8 \uD800 strict {} 0 {} High-surrogate + utf-8 \uDC00 default edb080 -1 {} High-surrogate + utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate + utf-8 \uDC00 strict {} 0 {} High-surrogate } - if {$::tcl_platform(byteOrder) eq "littleEndian"} { set endian le } else { @@ -437,6 +501,40 @@ proc testconvert {id body result args} { {*}$args } +proc testprofile {id converter enc profile data result args} { + if {$profile eq "default"} { + testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args + } + } else { + testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + } + } +} + + +# Wrapper for verifying -failindex +proc testfailindex {id converter enc data result {profile default}} { + if {$profile eq "default"} { + testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + } + } else { + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + } + } +} + test cmdAH-4.1.1 {encoding} -returnCodes error -body { encoding } -result {wrong # args: should be "encoding subcommand ?arg ...?"} @@ -492,42 +590,110 @@ testconvert cmdAH-4.3.12 { encoding system $system } -# Wrapper for verifying -failindex -proc testfailindex {id converter enc data result {profile default}} { - if {$profile eq "default"} { - testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result +# convertfrom, convertfrom -profile + +# convertfrom ?-profile? : All valid byte sequences should be accepted by all profiles +foreach {enc str hex} $encValidStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile + testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile + testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile + testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + } +} + +# convertfrom ?-profile? : invalid byte sequences +foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + set bytes [binary format H* $hex] + set prefix A + set suffix B + set prefixLen [string length [encoding convertto $enc $prefix]] + set result [list $str] + # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch + # so glob it out in error message pattern for now. + set errorWithoutPrefix [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] + set errorWithPrefix [list "unexpected byte sequence starting at index [expr {$failidx+$prefixLen}]: *" -returnCodes error -match glob] + if {$ctrl eq {} || "solo" in $ctrl} { + if {$failidx == -1} { + set result [list $str] + } else { + set result $errorWithoutPrefix } - } else { - testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + testprofile cmdAH-4.3.15.$hex.solo convertfrom $enc $profile $bytes {*}$result + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + set result [list $str$suffix] + } else { + set result $errorWithoutPrefix + } + testprofile cmdAH-4.3.15.$hex.lead convertfrom $enc $profile $bytes$suffix {*}$result + } + if {$ctrl eq {} || "tail" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix$str] + } else { + set result $errorWithPrefix + } + testprofile cmdAH-4.3.15.$hex.tail convertfrom $enc $profile $prefix$bytes {*}$result + } + if {$ctrl eq {} || "middle" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix$str$suffix] + } else { + set result $errorWithPrefix } + testprofile cmdAH-4.3.15.$hex.middle convertfrom $enc $profile $prefix$bytes$suffix {*}$result } } -# -failindex - valid data -foreach {enc string bytes} $encValidStrings { - testfailindex cmdAH-4.3.13.$enc convertfrom $enc $bytes [list $string -1] - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testfailindex cmdAH-4.3.13.$enc convertfrom $enc $bytes [list $string -1] +proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } } + return $print } -# -failindex - invalid data for each profile +# convertfrom -failindex - valid data +foreach {enc str hex} $encValidStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile + testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile + testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile + testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + } +} + + +# convertfrom -failindex, convertfrom -failindex -profile, invalid data foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # There are multiple test cases based on location of invalid bytes - set bytes [binary format H* $hex] + set bytes [binary decode hex $hex] set prefix A set suffix B set prefixLen [string length [encoding convertto $enc $prefix]] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex xxcmdAH-4.3.14.$profile.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -537,7 +703,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set result "" } - testfailindex xxcmdAH-4.3.14.$profile.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -547,9 +713,9 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { # Failure expected set result $prefix - incr expected_failidx [string length [encoding convertto $enc $prefix]] + incr expected_failidx $prefixLen } - testfailindex xxcmdAH-4.3.14.$profile.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -559,53 +725,9 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { # Failure expected set result $prefix - incr expected_failidx [string length [encoding convertto $enc $prefix]] - } - testfailindex xxcmdAH-4.3.14.$profile.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile - } -} - -# -profile - -# All valid byte sequences should be accepted by all profiles -foreach profile $encProfiles { - set i 0 - foreach {enc string bytes} $encValidStrings { - testconvert cmdAH-4.3.15.$enc.$profile.[incr i] [list encoding convertfrom $enc $bytes] $string - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.[incr i] [list encoding convertfrom $enc $bytes] $string - } - } -} - -# Cycle through the various combinations of encodings and profiles -# for invalid byte sequences -foreach {enc hex profile prefix failidx ctrl comment} $encInvalidBytes { - set bytes [binary format H* $hex] - if {$failidx eq -1} { - set result [list $prefix] - } else { - set badbyte "'\\x[string toupper [binary encode hex [string index $bytes $failidx]]]'" - # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch - # so glob it out for now. - set result [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] - } - if {$profile eq "default"} { - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result - } - } else { - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + incr expected_failidx $prefixLen } + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile } } @@ -646,41 +768,67 @@ testconvert cmdAH-4.4.12 { # -failindex - valid data foreach {enc string bytes} $encValidStrings { testfailindex cmdAH-4.4.13.$enc convertto $enc $string [list $bytes -1] - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testfailindex cmdAH-4.4.13.$enc convertto $enc $string [list $bytes -1] - } } # -failindex - invalid data -foreach {enc string profile bytes failidx tag} $encUnencodableStrings { - testfailindex cmdAH-4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testfailindex cmdAH-4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile +foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefixLen [string length [encoding convertto $enc $prefix]] + if {$ctrl eq {} || "solo" in $ctrl} { + testfailindex cmdAH-4.4.14.$string.solo convertto $enc $string [list $bytes $failidx] $profile + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + # If success expected + set result $bytes$suffix + } else { + # Failure expected + set result "" + } + testfailindex cmdAH-4.4.14.$string.lead convertto $enc $string$suffix [list $result $failidx] $profile + } + if {$ctrl eq {} || "tail" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$bytes + } else { + # Failure expected + set result $prefix + incr expected_failidx $prefixLen + } + testfailindex cmdAH-4.4.14.$string.tail convertto $enc $prefix$string [list $result $expected_failidx] $profile + } + if {$ctrl eq {} || "middle" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$bytes$suffix + } else { + # Failure expected + set result $prefix + incr expected_failidx $prefixLen + } + testfailindex cmdAH-4.4.14.$string.middle convertto $enc $prefix$string$suffix [list $result $expected_failidx] $profile } } -# -profile +# convertto -profile # All valid byte sequences should be accepted by all profiles foreach profile $encProfiles { set i 0 foreach {enc string bytes} $encValidStrings { - testconvert cmdAH-4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes - } + testprofile cmdAH-4.4.15 convertto $enc $profile $string $bytes } } # Cycle through the various combinations of encodings and profiles # for invalid byte sequences -foreach {enc string profile bytes failidx tag} $encUnencodableStrings { +foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] if {$failidx eq -1} { set result [list $bytes] } else { @@ -688,19 +836,20 @@ foreach {enc string profile bytes failidx tag} $encUnencodableStrings { # so glob it out for now. set result [list "unexpected character at index $failidx: *" -returnCodes error -match glob] } + #testprofile xx convertto $enc $profile $string {*}$result if {$profile eq "default"} { - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + # xxtestconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result } } else { - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result } } } |