diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-01 08:10:12 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-01 08:10:12 (GMT) |
commit | 3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e (patch) | |
tree | 4952179bcfbada1be4941093c77d7a531dc7f135 | |
parent | 537c8e77ba967fbb6f2ef1d7b2134420a3117bad (diff) | |
download | tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.zip tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.gz tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.bz2 |
(cherry-pick) Make the documentation comment of Tcl_UniCharToUtf more readable and add a test that exercises surrogate handling. (This test case was still missing; it cannot be used in Tcl 8.6.)
-rw-r--r-- | generic/tclUtf.c | 14 | ||||
-rw-r--r-- | tests/encoding.test | 24 |
2 files changed, 30 insertions, 8 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index db2be84..cb8bb3e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -185,17 +185,15 @@ Invalid( * Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the * provided buffer. Equivalent to Plan 9 runetochar(). * - * Special handling of Surrogate pairs is handled as follows: - * When this function is called for ch being a high surrogate, - * the first byte of the 4-byte UTF-8 sequence is produced and - * the function returns 1. Calling the function again with a - * low surrogate, the remaining 3 bytes of the 4-byte UTF-8 - * sequence is produced, and the function returns 3. The buffer - * is used to remember the high surrogate between the two calls. + * Surrogate pairs are handled as follows: When ch is a high surrogate, + * the first byte of the 4-byte UTF-8 sequence is stored in the buffer and + * the function returns 1. If the function is called again with a low + * surrogate and the same buffer, the remaining 3 bytes of the 4-byte + * UTF-8 sequence are produced. * * If no low surrogate follows the high surrogate (which is actually * illegal), this can be handled reasonably by calling Tcl_UniCharToUtf - * again with ch = -1. This will produce a 3-byte UTF-8 sequence + * again with ch = -1. This produces a 3-byte UTF-8 sequence * representing the high surrogate. 
* * Results: diff --git a/tests/encoding.test b/tests/encoding.test index 10a37f8..ae6c78a 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -482,6 +482,30 @@ test encoding-16.7 {Utf32ToUtfProc} -body { list $val [format %x [scan $val %c]] } -result "乎 4e4e" +test encoding-16.8 { + Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16 +} -body { + apply [list {} { + for {set i 0xD800} {$i < 0xDBFF} {incr i} { + for {set j 0xDC00} {$j < 0xDFFF} {incr j} { + set string [binary format S2 [list $i $j]] + set status [catch { + set decoded [encoding convertfrom utf-16be $string] + set encoded [encoding convertto utf-16be $decoded] + }] + if {$status || ( $encoded ne $string )} { + return [list [format %x $i] [format %x $j]] + } + } + } + return done + } [namespace current]] +} -result done + + + + + test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" |