blob: d5cec23363b4ecf640b29bf43e8a7fad539fcf9b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
|
# This file contains a collection of tests for Tcl_UtfToExternal and
# Tcl_UtfToExternal that exercise various combinations of flags,
# buffer lengths and fragmentation that cannot be tested by
# normal script level commands. There tests are NOT intended to check
# correct encodings; those are elsewhere.
#
# Copyright (c) 2023 Ashok P. Nadkarni
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
if {"::tcltest" ni [namespace children]} {
package require tcltest 2.5
namespace import -force ::tcltest::*
}
::tcltest::loadTestedCommands
catch [list package require -exact tcl::test [info patchlevel]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint testencoding [llength [info commands testencoding]]
namespace eval utftest {
# Format of table, indexed by encoding. The encodings are not exhaustive
# but one of each kind of encoding transform (algorithmic, table-driven,
# stateful, DBCS, MBCS).
# Each element is list of lists. Nested lists have following fields
# 0 comment (no spaces, might be used to generate id's as well)
# The combination of comment and internal hex (2) should be unique.
# 1 hex representation of internal *modified* utf-8 encoding. This is the
# source string for Tcl_UtfToExternal and expected result for
# Tcl_ExternalToUtf.
# 2 hex representation in specified encoding. This is the source string for
# Tcl_ExternalToUtf and expected result for Tcl_UtfToExternal.
# 3 internal fragmentation index - where to split field 1 for fragmentation
# tests. -1 to skip
# 4 external fragmentation index - where to split field 2 for fragmentation
# tests. -1 to skip
#
# THE HEX DEFINITIONS SHOULD SEPARATE EACH CHARACTER BY WHITESPACE
# (assumed by the charlimit tests)
lappend utfExtMap {*}{
ascii {
{basic {41 42 43} {41 42 43} -1 -1}
}
utf-8 {
{bmp {41 c3a9 42} {41 c3a9 42} 2 2}
{nonbmp-frag-1 {41 f09f9880 42} {41 f09f9880 42} 2 2}
{nonbmp-frag-2 {41 f09f9880 42} {41 f09f9880 42} 3 3}
{nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4}
{null {41 c080 42} {41 00 42} 2 -1}
}
cesu-8 {
{bmp {41 c3a9 42} {41 c3a9 42} 2 2}
{nonbmp-frag-surr-low {41 f09f9880 42} {41 eda0bd edb880 42} 2 2}
{nonbmp-split-surr {41 f09f9880 42} {41 eda0bd edb880 42} 3 -1}
{nonbmp-frag-surr-high {41 f09f9880 42} {41 eda0bd edb880 42} 4 6}
{null {41 c080 42} {41 00 42} 2 -1}
}
utf-16le {
{bmp {41 c3a9 42} {4100 e900 4200} 2 3}
{nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3}
{split-surrogate {41 f09f9080 42} {4100 3dd8 00dc 4200} 3 4}
{null {41 c080 42} {4100 0000 4200} 2 3}
}
utf-16be {
{bmp {41 c3a9 42} {0041 00e9 0042} 2 3}
{nonbmp {41 f09f9880 42} {0041 d83d de00 0042} 4 3}
{split-surrogate {41 f09f9080 42} {0041 d83d dc00 0042} 3 4}
{null {41 c080 42} {0041 0000 0042} 2 3}
}
utf-32le {
{bmp {41 c3a9 42} {41000000 e9000000 42000000} 2 3}
{nonbmp {41 f09f9880 42} {41000000 00f60100 42000000} 4 6}
{null {41 c080 42} {41000000 00000000 42000000} 2 3}
}
utf-32be {
{bmp {41 c3a9 42} {00000041 000000e9 00000042} 2 3}
{nonbmp {41 f09f9880 42} {00000041 0001f600 00000042} 4 3}
{null {41 c080 42} {00000041 00000000 00000042} 2 3}
}
iso8859-1 {
{basic {41 c3a9 42} {41 e9 42} 2 -1}
{null {41 c080 42} {41 00 42} 2 -1}
}
iso8859-3 {
{basic {41 c4a0 42} {41 d5 42} 2 -1}
{null {41 c080 42} {41 00 42} 2 -1}
}
shiftjis {
{basic {41 e4b98e 42} {41 8cc1 42} 3 2}
}
jis0208 {
{basic {e4b98e e590be} {3843 3863} 1 1}
}
iso2022-jp {
{frag-in-leadescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 2}
{frag-in-char {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 5}
{frag-in-trailescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 8}
}
}
# Return a binary string containing nul terminator for encoding
proc hexnuls {enc} {
return [binary encode hex [encoding convertto $enc \x00]]
}
# The C wrapper fills entire destination buffer with FF.
# Anything beyond expected output should have FF's
proc fill {bin buflen} {
return [string range "$bin[string repeat \xFF $buflen]" 0 $buflen-1]
}
proc testutf {direction enc comment hexin hexout args} {
set id $comment-[join $hexin ""]
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set in [binary decode hex $hexin]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
set status ok
set flags [list start end]
set constraints [list testencoding]
set profiles [encoding profiles]
while {[llength $args] > 1} {
set opt [lpop args 0]
switch $opt {
-flags { set flags [lpop args 0] }
-constraints { lappend constraints {*}[lpop args 0] }
-profiles { set profiles [lpop args 0] }
-status { set status [lpop args 0]}
default {
error "Unknown option \"$opt\""
}
}
}
if {[llength $args]} {
error "No value supplied for option [lindex $args 0]."
}
set result [list $status {} [fill $out $dstlen]]
test $cmd-$enc-$id-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \
[list testencoding $cmd $enc $in $flags {} $dstlen] \
-result $result -constraints $constraints
foreach profile $profiles {
set flags2 [linsert $flags end $profile]
test $cmd-$enc-$id-[join $flags2 -] "$cmd - $enc - $hexin - $flags2" -body \
[list testencoding $cmd $enc $in $flags2 {} $dstlen] \
-result $result -constraints $constraints
}
}
proc testfragment {direction enc comment hexin hexout fragindex args} {
if {$fragindex < 0} {
# Single byte encodings so no question of fragmentation
return
}
set id $comment-[join $hexin ""]-fragment
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set status1 multibyte; # Return status to expect after first call
while {[llength $args] > 1} {
set opt [lpop args 0]
switch $opt {
-status1 { set status1 [lpop args 0]}
default {
error "Unknown option \"$opt\""
}
}
}
set in [binary decode hex $hexin]
set infrag [string range $in 0 $fragindex-1]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body {
set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written]
lassign $frag1Result frag1Status frag1State frag1Decoded
set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written]
lassign $frag2Result frag2Status frag2State frag2Decoded
set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]]
list $frag1Status [expr {$frag1Read <= $fragindex}] \
$frag2Status [expr {$frag1Read+$frag2Read}] \
[expr {$frag1Written+$frag2Written}] $decoded
} -result [list $status1 1 ok [string length $in] [string length $out] $out]
}
proc testcharlimit {direction enc comment hexin hexout} {
set id $comment-[join $hexin ""]-charlimit
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set maxchars [llength $hexout]
set in [binary decode hex $hexin]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
for {set nchars 0} {$nchars <= $maxchars} {incr nchars} {
set expected_bytes [binary decode hex [lrange $hexout 0 $nchars-1]]
set expected_nwritten [string length $expected_bytes]
test $cmd-$enc-$id-$nchars "$cmd - $enc - $hexin - nchars $nchars" -constraints testencoding -body {
set charlimit $nchars
lassign [testencoding $cmd $enc $in \
{start end charlimit} 0 $dstlen nread nwritten charlimit] \
status state buf
list $status $nwritten [string range $buf 0 $nwritten-1]
} -result [list [expr {$nchars == $maxchars ? "ok" : "nospace"}] $expected_nwritten $expected_bytes]
}
}
#
# Basic tests
foreach {enc testcases} $utfExtMap {
foreach testcase $testcases {
lassign $testcase {*}{comment utfhex hex internalfragindex externalfragindex}
# Basic test - TCL_ENCODING_START|TCL_ENCODING_END
# Note by default output should be terminated with \0
set encnuls [hexnuls $enc]
testutf toutf $enc $comment $hex ${utfhex}00
testutf fromutf $enc $comment $utfhex $hex$encnuls
# Test TCL_ENCODING_NO_TERMINATE
testutf toutf $enc $comment $hex $utfhex -flags {start end noterminate}
# noterminate is specific to ExternalToUtf,
# should have no effect in other direction
testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate}
# Fragments
testfragment toutf $enc $comment $hex $utfhex $externalfragindex
testfragment fromutf $enc $comment $utfhex $hex $internalfragindex
# Char limits - note no fromutf as Tcl_UtfToExternal does not support it
testcharlimit toutf $enc $comment $hex $utfhex
}
}
# Special cases - cesu2 high and low surrogates in separate fragments
# This will (correctly) return "ok", not "multibyte" after first frag
testfragment toutf cesu-8 nonbmp-split-surr \
{41 eda0bd edb880 42} {41 f09f9880 42} 4 -status1 ok
# Bug regression tests
test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body {
testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1
} -result [list nospace {} \xFF] -constraints testencoding
test Tcl_ExternalToUtf-bug-5be203d6ca {
truncated prefix in table encoding
} -body {
set src \x82\x4F\x82\x50\x82
set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start tcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end tcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
} -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding
}
namespace delete utftest
::tcltest::cleanupTests
return
# Local Variables:
# mode: tcl
# End:
|