blob: 0554a255b0726eeb2c640907846d1233969e3850 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
|
# See the file LICENSE for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
if {[lsearch [namespace children] ::tcltest] == -1} {
package require tcltest
namespace import ::tcltest::*
}
tcltest::loadTestedCommands
package require tcl::test
source [file join [file dirname [info script]] ucdUtils.tcl]
namespace eval unicode::normalization::test {
namespace path ::tcltests::ucd
variable singleFormChar
variable testCase
variable normForm
variable normEnums; # Matches Tcl_UnicodeNormalizationForm enums
array set normEnums {
nfc 0
nfd 1
nfkc 2
nfkd 3
}
variable profileFlags; # Match TCL_ENCODING_PROFILE_* C flags
array set profileFlags {
strict 0x00000000
tcl8 0x01000000
replace 0x02000000
}
variable bytes
proc hexListToChars {s} {
# 0044 030c -> \u0044\u030c
subst -novariables -nocommands \\U[join $s \\U]
}
# Standard arg number tests
test unicode-badargs-0 {unicode no args} -returnCodes error -body {
unicode
} -result {wrong # args: should be "unicode subcommand ?arg ...?"}
test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
unicode foo
} -result {unknown or ambiguous subcommand "foo": must be tonfc, tonfd, tonfkc, or tonfkd}
variable cmd
foreach cmd {tonfc tonfd tonfkc tonfkd} {
test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
unicode $cmd
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {
unicode $cmd -profile strict
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
test $cmd-badargs-2 "$cmd extra args" -returnCodes error -body {
unicode $cmd -profile strict foo extra
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
}
# Test generation for nfc, nfd, nfkc, nfkd
variable allChars
variable allNfc
variable allNfd
variable allNfkc
variable allNfkd
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
lappend allChars $chars
lappend allNfc $nfc
lappend allNfd $nfd
lappend allNfkc $nfkc
lappend allNfkd $nfkd
test tonfc-line-$lineno \
"Test case for NFC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfc [unicode tonfc $chars]] \
[string equal $nfc [unicode tonfc $nfc]] \
[string equal $nfc [unicode tonfc $nfd]] \
[string equal $nfkc [unicode tonfc $nfkc]] \
[string equal $nfkc [unicode tonfc $nfkd]]
} -result {1 1 1 1 1}
test tonfd-line-$lineno \
"Test case for NFD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfd [unicode tonfd $chars]] \
[string equal $nfd [unicode tonfd $nfc]] \
[string equal $nfd [unicode tonfd $nfd]] \
[string equal $nfkd [unicode tonfd $nfkc]] \
[string equal $nfkd [unicode tonfd $nfkd]]
} -result {1 1 1 1 1}
test tonfkc-line-$lineno \
"Test case for NFKC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfkc [unicode tonfkc $chars]] \
[string equal $nfkc [unicode tonfkc $nfc]] \
[string equal $nfkc [unicode tonfkc $nfd]] \
[string equal $nfkc [unicode tonfkc $nfkc]] \
[string equal $nfkc [unicode tonfkc $nfkd]]
} -result {1 1 1 1 1}
test tonfkd-line-$lineno \
"Test case for NFKD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfkd [unicode tonfkd $chars]] \
[string equal $nfkd [unicode tonfkd $nfc]] \
[string equal $nfkd [unicode tonfkd $nfd]] \
[string equal $nfkd [unicode tonfkd $nfkc]] \
[string equal $nfkd [unicode tonfkd $nfkd]]
} -result {1 1 1 1 1}
}
# Test the entire string. Note normalization is not a closed operation
# so normalize(concatenation) != concatenate(normalization) so we insert
# \uFFFD (replacement char) as separator to prevent adjacent cases being
# combined. This is not a whole lot different from the above individual
# tests but more of a "long string" test.
test unicode-normalization-concat "Normalize concatenation of test vectors" -body {
list \
[string equal [unicode tonfc [join $allChars \uFFFD]] [join $allNfc \uFFFD]] \
[string equal [unicode tonfd [join $allChars \uFFFD]] [join $allNfd \uFFFD]] \
[string equal [unicode tonfkc [join $allChars \uFFFD]] [join $allNfkc \uFFFD]] \
[string equal [unicode tonfkd [join $allChars \uFFFD]] [join $allNfkd \uFFFD]]
} -result {1 1 1 1}
# Each single form character should map to itself for all forms
test normalize-singleform-0 "Normalize single form characters" \
-constraints ucdnormalization \
-body {
lmap singleFormChar [getSingleFormChars] {
if {[tcl::mathop::eq \
$singleFormChar \
[unicode tonfc $singleFormChar] \
[unicode tonfd $singleFormChar] \
[unicode tonfkc $singleFormChar] \
[unicode tonfkd $singleFormChar] \
]} {
continue
}
set singleFormChar
}
} -result {}
# Test generation for casefolding
# NOTE: casefolding is not in TIP 726 so these tests are not in use
# at the moment.
if {[tcltest::testConstraint ucdcasefolding]} {
foreach testCase [getCaseFoldData] {
lassign $testCase lineno chars casefoldedchars
set id [format %.6X [scan $chars %c]]
test normalize-line-$lineno-$id-nfccasefold \
"Test case for NFC_CaseFold at line $lineno of $::tcltests::ucd::caseFoldDataFile" \
-constraints ucdcasefolding \
-body {
# puts [codepoints $chars]->[codepoints $casefoldedchars]
# See Comments in DerivedNormalizationProps.txt for expected behaviours
toNFKC_Casefold $chars
} -result $casefoldedchars
}
# Characters that should case fold to themselves
proc codepoints {s} {join [lmap c [split $s ""] {
string cat U+ [format %.6X [scan $c %c]]}]
}
test normalize-casefold-identities-0 \
"NFKC Case fold chars mapping to themselves" \
-constraints ucdcasefolding \
-body {
lmap char [caseFoldIdentities] {
if {$char eq [toNFKC_Casefold $char]} {
continue
}
set char
}
} -result {}
}
# Profiles
test tonfc-profile-default-0 "tonfc -profile default success" -body {
unicode tonfc X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-default-1 "tonfc -profile default fail" -body {
unicode tonfc X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfc-profile-strict-0 "tonfc -profile strict success" -body {
unicode tonfc -profile strict X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-strict-1 "tonfc -profile strict fail" -body {
unicode tonfc -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfc-profile-replace-0 "tonfc -profile replace success" -body {
unicode tonfc -profile replace X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-replace-1 "tonfc -profile replace fail" -body {
unicode tonfc -profile replace X\ud800Y
} -result X\uFFFDY
test tonfc-profile-tcl8-0 "tonfc -profile tcl8" -returnCodes error -body {
unicode tonfc -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfd-profile-default-0 "tonfd -profile default success" -body {
unicode tonfd X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-default-1 "tonfd -profile default fail" -body {
unicode tonfd \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfd-profile-strict-0 "tonfd -profile strict success" -body {
unicode tonfd -profile strict X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-strict-1 "tonfd -profile strict fail" -body {
unicode tonfd -profile strict X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfd-profile-replace-0 "tonfd -profile replace success" -body {
unicode tonfd -profile replace X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-replace-1 "tonfd -profile replace fail" -body {
unicode tonfd -profile replace \ud800
} -result \uFFFD
test tonfd-profile-tcl8-0 "tonfd -profile tcl8" -returnCodes error -body {
unicode tonfd -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfkc-profile-default-0 "tonfkc -profile default success" -body {
unicode tonfkc X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-default-1 "tonfkc -profile default fail" -body {
unicode tonfkc X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfkc-profile-strict-0 "tonfkc -profile strict success" -body {
unicode tonfkc -profile strict X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-strict-1 "tonfkc -profile strict fail" -body {
unicode tonfkc -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfkc-profile-replace-0 "tonfkc -profile replace success" -body {
unicode tonfkc -profile replace X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-replace-1 "tonfkc -profile replace fail" -body {
unicode tonfkc -profile replace X\ud800Y
} -result X\uFFFDY
test tonfkc-profile-tcl8-0 "tonfkc -profile tcl8" -returnCodes error -body {
unicode tonfkc -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfkd-profile-default-0 "tonfkd -profile default success" -body {
unicode tonfkd X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-default-1 "tonfkd -profile default fail" -body {
unicode tonfkd X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfkd-profile-strict-0 "tonfkd -profile strict success" -body {
unicode tonfkd -profile strict X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-strict-1 "tonfkd -profile strict fail" -body {
unicode tonfkd -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfkd-profile-replace-0 "tonfkd -profile replace success" -body {
unicode tonfkd -profile replace X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-replace-1 "tonfkd -profile replace fail" -body {
unicode tonfkd -profile replace X\ud800Y
} -result X\uFFFDY
test tonfkd-profile-tcl8-0 "tonfkd -profile tcl8" -returnCodes error -body {
unicode tonfkd -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
# Tcl_UtfToNormalizedDString C API
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
set bytes [teststringbytes $chars]
foreach profile {strict replace} {
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
"Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-body {
testutftonormalizeddstring $bytes $normEnums($normForm) \
$profileFlags($profile)
} -result [teststringbytes [set $normForm]]
}
}
}
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
"Tcl_UtfToNormalizedDString for $normForm passed nul character" \
-body {
testutftonormalizeddstring [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict)
} -result \xC0\x80
}
# Test the entire string. Note normalization is not a closed operation
# so normalize(concatenation) != concatenate(normalization) so we insert
# \uFFFD (replacement char) as separator to prevent adjacent cases being
# combined. This is not a whole lot different from the above individual
# tests but more of a "long string" test.
test Tcl_UtfToNormalizedDString-concat "Normalize concatenation of test vectors" -setup {
set bytes [teststringbytes [join $allChars \uFFFD]]
} -body {
list \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfc) $profileFlags(strict)] \
[teststringbytes [join $allNfc \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfd) $profileFlags(strict)] \
[teststringbytes [join $allNfd \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfkc) $profileFlags(strict)] \
[teststringbytes [join $allNfkc \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfkd) $profileFlags(strict)] \
[teststringbytes [join $allNfkd \uFFFD]]]
} -result {1 1 1 1}
# Tcl_UtfToNormalizedDString error cases
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
"Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
-body {
testutftonormalizeddstring abc $normEnums($normForm) $profileFlags(tcl8)
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
if {0} {
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
# no way to test normalization of invalid UTF-8. Enable this test
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
"Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
-body {
testutftonormalizeddstring [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict)
} -result {} -returnCodes error
}
}
test Tcl_UtfToNormalizedDString-invalid-normalization-form \
"Tcl_UtfToNormalizedDString invalid value for normalization form" \
-body {
testutftonormalizeddstring abc 4 $profileFlags(strict)
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
# Tcl_UtfToNormalized C API
variable normBytes
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
set bytes [teststringbytes $chars]
foreach profile {strict replace} {
foreach normForm {nfc nfd nfkc nfkd} {
set normBytes [teststringbytes [set $normForm]]
test Tcl_UtfToNormalized-$normForm-line-$lineno-$profile \
"Tcl_UtfToNormalized $normForm line $lineno of $::tcltests::ucd::normalizationDataFile" \
-body {
# Tests:
# No length specified (implicit length of bytes)
# Length of -1
# Buffer too small
set result [testutftonormalized $bytes \
$normEnums($normForm) \
$profileFlags($profile) 100]
set result_minus1 [testutftonormalized $bytes\0 \
$normEnums($normForm) \
$profileFlags($profile) -1 100]
list $result \
$result_minus1 \
[catch {
testutftonormalized $bytes $normEnums($normForm) \
$profileFlags($profile) \
[expr {[string length $result]-1}]
} message] \
$message
} -result [list $normBytes $normBytes -4 {Output buffer too small.}]
}
}
}
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalized-$normForm-nulchar \
"Tcl_UtfToNormalized $normForm passed nul character" \
-body {
list \
[testutftonormalized [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict) 3] \
[catch {
[testutftonormalized [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict) 2]
} message] \
$message
} -result [list \xC0\x80 -4 {Output buffer too small.}]
}
# Tcl_UtfToNormalized error cases
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalized-$normForm-tcl8 \
"Tcl_UtfToNormalized for $normForm profile tcl8" \
-body {
testutftonormalized abc $normEnums($normForm) $profileFlags(tcl8) 20
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
if {0} {
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
# no way to test normalization of invalid UTF-8. Enable this test
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
test Tcl_UtfToNormalized-$normForm-invalid-utf8 \
"Tcl_UtfToNormalized for $normForm invalid utf8 profile strict" \
-body {
testutftonormalized [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict) 20
} -result {} -returnCodes error
}
}
test Tcl_UtfToNormalized-invalid-normalization-form \
"Tcl_UtfToNormalized invalid value for normalization form" \
-body {
testutftonormalized abc 4 $profileFlags(strict) 20
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
}
::tcltest::cleanupTests
namespace delete unicode::normalization::test
return
|