summaryrefslogtreecommitdiffstats
path: root/tests/unicodeNormalize.test
blob: 0554a255b0726eeb2c640907846d1233969e3850 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
# See the file LICENSE for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.

if {[lsearch [namespace children] ::tcltest] == -1} {
    package require tcltest
    namespace import ::tcltest::*
}
tcltest::loadTestedCommands
package require tcl::test

source [file join [file dirname [info script]] ucdUtils.tcl]

namespace eval unicode::normalization::test {
	namespace path ::tcltests::ucd

    variable singleFormChar
    variable testCase

    variable normForm
    variable normEnums; # Matches Tcl_UnicodeNormalizationForm enums
    array set normEnums {
        nfc  0
        nfd  1
        nfkc 2
        nfkd 3
    }
    variable profileFlags; # Match TCL_ENCODING_PROFILE_* C flags
    array set profileFlags {
        strict  0x00000000
        tcl8    0x01000000
        replace 0x02000000
    }
    variable bytes

    proc hexListToChars {s} {
        # 0044 030c -> \u0044\u030c
        subst -novariables -nocommands \\U[join $s \\U]
    }

    # Standard arg number tests
    test unicode-badargs-0 {unicode no args} -returnCodes error -body {
        unicode
    } -result {wrong # args: should be "unicode subcommand ?arg ...?"}
    test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
        unicode foo
    } -result {unknown or ambiguous subcommand "foo": must be tonfc, tonfd, tonfkc, or tonfkd}

    variable cmd
    foreach cmd {tonfc tonfd tonfkc tonfkd} {
        test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
            unicode $cmd
        } -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
        test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {
            unicode $cmd -profile strict
        } -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
        test $cmd-badargs-2 "$cmd extra args" -returnCodes error -body {
            unicode $cmd -profile strict foo extra
        } -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
    }

    # Test generation for nfc, nfd, nfkc, nfkd
	variable allChars
	variable allNfc
	variable allNfd
	variable allNfkc
	variable allNfkd
    foreach testCase [getNormalizationData] {
        lassign $testCase lineno chars nfc nfd nfkc nfkd
		lappend allChars $chars
		lappend allNfc $nfc
		lappend allNfd $nfd
		lappend allNfkc $nfkc
		lappend allNfkd $nfkd
        test tonfc-line-$lineno \
            "Test case for NFC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
            -constraints ucdnormalization \
            -body {
                # See Comments in NormalizationTest.txt for expected behaviours
                list \
                    [string equal $nfc [unicode tonfc $chars]] \
                    [string equal $nfc [unicode tonfc $nfc]] \
                    [string equal $nfc [unicode tonfc $nfd]] \
                    [string equal $nfkc [unicode tonfc $nfkc]] \
                    [string equal $nfkc [unicode tonfc $nfkd]]
            } -result {1 1 1 1 1}

        test tonfd-line-$lineno \
            "Test case for NFD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
            -constraints ucdnormalization \
            -setup {
                readNormalizationData
            } -body {
                # See Comments in NormalizationTest.txt for expected behaviours
                list \
                    [string equal $nfd [unicode tonfd $chars]] \
                    [string equal $nfd [unicode tonfd $nfc]] \
                    [string equal $nfd [unicode tonfd $nfd]] \
                    [string equal $nfkd [unicode tonfd $nfkc]] \
                    [string equal $nfkd [unicode tonfd $nfkd]]
            } -result {1 1 1 1 1}

        test tonfkc-line-$lineno \
            "Test case for NFKC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
            -constraints ucdnormalization \
            -setup {
                readNormalizationData
            } -body {
                # See Comments in NormalizationTest.txt for expected behaviours
                list \
                    [string equal $nfkc [unicode tonfkc $chars]] \
                    [string equal $nfkc [unicode tonfkc $nfc]] \
                    [string equal $nfkc [unicode tonfkc $nfd]] \
                    [string equal $nfkc [unicode tonfkc $nfkc]] \
                    [string equal $nfkc [unicode tonfkc $nfkd]]
            } -result {1 1 1 1 1}

        test tonfkd-line-$lineno \
            "Test case for NFKD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
            -constraints ucdnormalization \
            -setup {
                readNormalizationData
            } -body {
                # See Comments in NormalizationTest.txt for expected behaviours
                list \
                    [string equal $nfkd [unicode tonfkd $chars]] \
                    [string equal $nfkd [unicode tonfkd $nfc]] \
                    [string equal $nfkd [unicode tonfkd $nfd]] \
                    [string equal $nfkd [unicode tonfkd $nfkc]] \
                    [string equal $nfkd [unicode tonfkd $nfkd]]
            } -result {1 1 1 1 1}
    }

	# Test the entire string. Note normalization is not a closed operation
	# so normalize(concatenation) != concatenate(normalization) so we insert
	# \uFFFD (replacement char) as separator to prevent adjacent cases being
	# combined. This is not a whole lot different from the above individual
	# tests but more of a "long string" test.
	test unicode-normalization-concat "Normalize concatenation of test vectors" -body {
		list \
		[string equal [unicode tonfc [join $allChars \uFFFD]] [join $allNfc \uFFFD]] \
		[string equal [unicode tonfd [join $allChars \uFFFD]] [join $allNfd \uFFFD]] \
		[string equal [unicode tonfkc [join $allChars \uFFFD]] [join $allNfkc \uFFFD]] \
		[string equal [unicode tonfkd [join $allChars \uFFFD]] [join $allNfkd \uFFFD]]
	} -result {1 1 1 1}

    # Each single form character should map to itself for all forms
    test normalize-singleform-0 "Normalize single form characters" \
        -constraints ucdnormalization \
        -body {
            lmap singleFormChar [getSingleFormChars] {
                if {[tcl::mathop::eq \
                         $singleFormChar \
                         [unicode tonfc $singleFormChar] \
                         [unicode tonfd $singleFormChar] \
                         [unicode tonfkc $singleFormChar] \
                         [unicode tonfkd $singleFormChar] \
                        ]} {
                    continue
                }
                set singleFormChar
            }
        } -result {}

    # Test generation for casefolding
	# NOTE: casefolding is not in TIP 726 so these tests are not in use
	# at the moment.
    if {[tcltest::testConstraint ucdcasefolding]} {
        foreach testCase [getCaseFoldData] {
            lassign $testCase lineno chars casefoldedchars
            set id [format %.6X [scan $chars %c]]
            test normalize-line-$lineno-$id-nfccasefold \
                "Test case for NFC_CaseFold at line $lineno of $::tcltests::ucd::caseFoldDataFile" \
                -constraints ucdcasefolding \
                -body {
                    # puts [codepoints $chars]->[codepoints $casefoldedchars]
                    # See Comments in DerivedNormalizationProps.txt for expected behaviours
                    toNFKC_Casefold $chars
                } -result $casefoldedchars
        }
        # Characters that should case fold to themselves
        proc codepoints {s} {join [lmap c [split $s ""] {
            string cat U+ [format %.6X [scan $c %c]]}]
        }
        test normalize-casefold-identities-0 \
            "NFKC Case fold chars mapping to themselves" \
            -constraints ucdcasefolding \
            -body {
                lmap char [caseFoldIdentities] {
                    if {$char eq [toNFKC_Casefold $char]} {
                        continue
                    }
                    set char
                }
            } -result {}
    }

    # Profiles
    test tonfc-profile-default-0 "tonfc -profile default success" -body {
        unicode tonfc X\u1e0a\u031b\u0323Y
    } -result X\u1e0c\u031b\u0307Y
    test tonfc-profile-default-1 "tonfc -profile default fail" -body {
        unicode tonfc X\ud800Y
    } -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
    test tonfc-profile-strict-0 "tonfc -profile strict success" -body {
        unicode tonfc -profile strict X\u1e0a\u031b\u0323Y
    } -result X\u1e0c\u031b\u0307Y
    test tonfc-profile-strict-1 "tonfc -profile strict fail" -body {
        unicode tonfc -profile strict \ud800
    } -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
    test tonfc-profile-replace-0 "tonfc -profile replace success" -body {
        unicode tonfc -profile replace X\u1e0a\u031b\u0323Y
    } -result X\u1e0c\u031b\u0307Y
    test tonfc-profile-replace-1 "tonfc -profile replace fail" -body {
        unicode tonfc -profile replace X\ud800Y
    } -result X\uFFFDY
    test tonfc-profile-tcl8-0 "tonfc -profile tcl8" -returnCodes error -body {
        unicode tonfc -profile tcl8 x
    } -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}

    test tonfd-profile-default-0 "tonfd -profile default success" -body {
        unicode tonfd X\u1E0A\u031B\u0323Y
    } -result X\u0044\u031B\u0323\u0307Y
    test tonfd-profile-default-1 "tonfd -profile default fail" -body {
        unicode tonfd \ud800
    } -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
    test tonfd-profile-strict-0 "tonfd -profile strict success" -body {
        unicode tonfd -profile strict X\u1E0A\u031B\u0323Y
    } -result X\u0044\u031B\u0323\u0307Y
    test tonfd-profile-strict-1 "tonfd -profile strict fail" -body {
        unicode tonfd -profile strict X\ud800Y
    } -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
    test tonfd-profile-replace-0 "tonfd -profile replace success" -body {
        unicode tonfd -profile replace X\u1E0A\u031B\u0323Y
    } -result X\u0044\u031B\u0323\u0307Y
    test tonfd-profile-replace-1 "tonfd -profile replace fail" -body {
        unicode tonfd -profile replace \ud800
    } -result \uFFFD
    test tonfd-profile-tcl8-0 "tonfd -profile tcl8" -returnCodes error -body {
        unicode tonfd -profile tcl8 x
    } -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}

    test tonfkc-profile-default-0 "tonfkc -profile default success" -body {
        unicode tonfkc X\u01C4\u0323Y
    } -result X\u0044\u1E92\u030CY
    test tonfkc-profile-default-1 "tonfkc -profile default fail" -body {
        unicode tonfkc X\ud800Y
    } -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
    test tonfkc-profile-strict-0 "tonfkc -profile strict success" -body {
        unicode tonfkc -profile strict X\u01C4\u0323Y
    } -result X\u0044\u1E92\u030CY
    test tonfkc-profile-strict-1 "tonfkc -profile strict fail" -body {
        unicode tonfkc -profile strict \ud800
    } -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
    test tonfkc-profile-replace-0 "tonfkc -profile replace success" -body {
        unicode tonfkc -profile replace X\u01C4\u0323Y
    } -result X\u0044\u1E92\u030CY
    test tonfkc-profile-replace-1 "tonfkc -profile replace fail" -body {
        unicode tonfkc -profile replace X\ud800Y
    } -result X\uFFFDY
    test tonfkc-profile-tcl8-0 "tonfkc -profile tcl8" -returnCodes error -body {
        unicode tonfkc -profile tcl8 x
    } -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}

    test tonfkd-profile-default-0 "tonfkd -profile default success" -body {
        unicode tonfkd X\u01C4\u0323Y
    } -result X\u0044\u005A\u0323\u030CY
    test tonfkd-profile-default-1 "tonfkd -profile default fail" -body {
        unicode tonfkd X\ud800Y
    } -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
    test tonfkd-profile-strict-0 "tonfkd -profile strict success" -body {
        unicode tonfkd -profile strict X\u01C4\u0323Y
    } -result X\u0044\u005A\u0323\u030CY
    test tonfkd-profile-strict-1 "tonfkd -profile strict fail" -body {
        unicode tonfkd -profile strict \ud800
    } -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
    test tonfkd-profile-replace-0 "tonfkd -profile replace success" -body {
        unicode tonfkd -profile replace X\u01C4\u0323Y
    } -result X\u0044\u005A\u0323\u030CY
    test tonfkd-profile-replace-1 "tonfkd -profile replace fail" -body {
        unicode tonfkd -profile replace X\ud800Y
    } -result X\uFFFDY
    test tonfkd-profile-tcl8-0 "tonfkd -profile tcl8" -returnCodes error -body {
        unicode tonfkd -profile tcl8 x
    } -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}

    # Tcl_UtfToNormalizedDString C API

    foreach testCase [getNormalizationData] {
        lassign $testCase lineno chars nfc nfd nfkc nfkd
        set bytes [teststringbytes $chars]
        foreach profile {strict replace} {
            foreach normForm {nfc nfd nfkc nfkd} {
                test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
                    "Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
                    -body {
                        testutftonormalizeddstring $bytes $normEnums($normForm) \
					        $profileFlags($profile)
                    } -result [teststringbytes [set $normForm]]
            }
        }
    }

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
            "Tcl_UtfToNormalizedDString for $normForm passed nul character" \
            -body {
                testutftonormalizeddstring [teststringbytes \0] \
				$normEnums($normForm) $profileFlags(strict)
            } -result \xC0\x80
    }

    # Test the entire string. Note normalization is not a closed operation
    # so normalize(concatenation) != concatenate(normalization) so we insert
    # \uFFFD (replacement char) as separator to prevent adjacent cases being
    # combined. This is not a whole lot different from the above individual
    # tests but more of a "long string" test.
    test Tcl_UtfToNormalizedDString-concat "Normalize concatenation of test vectors" -setup {
        set bytes [teststringbytes [join $allChars \uFFFD]]
    } -body {
        list \
            [string equal \
                 [testutftonormalizeddstring $bytes $normEnums(nfc) $profileFlags(strict)] \
                 [teststringbytes [join $allNfc \uFFFD]]] \
            [string equal \
                 [testutftonormalizeddstring $bytes $normEnums(nfd) $profileFlags(strict)] \
                 [teststringbytes [join $allNfd \uFFFD]]] \
            [string equal \
                 [testutftonormalizeddstring $bytes $normEnums(nfkc) $profileFlags(strict)] \
                 [teststringbytes [join $allNfkc \uFFFD]]] \
            [string equal \
                 [testutftonormalizeddstring $bytes $normEnums(nfkd) $profileFlags(strict)] \
                 [teststringbytes [join $allNfkd \uFFFD]]]
    } -result {1 1 1 1}

    # Tcl_UtfToNormalizedDString error cases

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
            "Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
            -body {
                testutftonormalizeddstring abc $normEnums($normForm) $profileFlags(tcl8)
            } -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error

        if {0} {
            # TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
            # no way to test normalization of invalid UTF-8. Enable this test
            # once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
            test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
                "Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
                -body {
                    testutftonormalizeddstring [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict)
                } -result {} -returnCodes error
        }
    }

    test Tcl_UtfToNormalizedDString-invalid-normalization-form \
            "Tcl_UtfToNormalizedDString invalid value for normalization form" \
            -body {
                testutftonormalizeddstring abc 4 $profileFlags(strict)
            } -result {Invalid value 4 passed for normalization form.} -returnCodes error


    # Tcl_UtfToNormalized C API

    variable normBytes
    foreach testCase [getNormalizationData] {
        lassign $testCase lineno chars nfc nfd nfkc nfkd
        set bytes [teststringbytes $chars]
        foreach profile {strict replace} {
            foreach normForm {nfc nfd nfkc nfkd} {
                set normBytes [teststringbytes [set $normForm]]
                test Tcl_UtfToNormalized-$normForm-line-$lineno-$profile \
                    "Tcl_UtfToNormalized $normForm line $lineno of $::tcltests::ucd::normalizationDataFile" \
                    -body {
                        # Tests:
                        # No length specified (implicit length of bytes)
                        # Length of -1
                        # Buffer too small
                        set result [testutftonormalized $bytes \
                                        $normEnums($normForm) \
                                        $profileFlags($profile) 100]
                        set result_minus1 [testutftonormalized $bytes\0 \
                                        $normEnums($normForm) \
                                        $profileFlags($profile) -1 100]
                        list $result \
                            $result_minus1 \
                            [catch {
                                testutftonormalized $bytes $normEnums($normForm) \
                                    $profileFlags($profile) \
                                    [expr {[string length $result]-1}]
                            } message] \
                            $message
                    } -result [list $normBytes $normBytes -4 {Output buffer too small.}]
            }
        }
    }

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalized-$normForm-nulchar \
            "Tcl_UtfToNormalized $normForm passed nul character" \
            -body {
                list \
                    [testutftonormalized [teststringbytes \0] \
                         $normEnums($normForm) $profileFlags(strict) 3] \
                    [catch {
                        [testutftonormalized [teststringbytes \0] \
                         $normEnums($normForm) $profileFlags(strict) 2]
                    } message] \
                    $message
            } -result [list \xC0\x80 -4 {Output buffer too small.}]
    }

    # Tcl_UtfToNormalized error cases

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalized-$normForm-tcl8 \
            "Tcl_UtfToNormalized for $normForm profile tcl8" \
            -body {
                testutftonormalized abc $normEnums($normForm) $profileFlags(tcl8) 20
            } -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error

        if {0} {
            # TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
            # no way to test normalization of invalid UTF-8. Enable this test
            # once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
            test Tcl_UtfToNormalized-$normForm-invalid-utf8 \
                "Tcl_UtfToNormalized for $normForm invalid utf8 profile strict" \
                -body {
                    testutftonormalized [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict) 20
                } -result {} -returnCodes error
        }
    }

    test Tcl_UtfToNormalized-invalid-normalization-form \
            "Tcl_UtfToNormalized invalid value for normalization form" \
            -body {
                testutftonormalized abc 4 $profileFlags(strict) 20
            } -result {Invalid value 4 passed for normalization form.} -returnCodes error
}

::tcltest::cleanupTests
namespace delete unicode::normalization::test
return