(cherry-pick) Make Tcl_UniCharToUtf more readable and add a test to exercise surrogate handling. (The test case was still missing here; it cannot be used in Tcl 8.6.)

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-01 08:10:12 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-01 08:10:12 (GMT)
commit: 3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e (patch)
tree: 4952179bcfbada1be4941093c77d7a531dc7f135
parent: 537c8e77ba967fbb6f2ef1d7b2134420a3117bad (diff)
download: tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.zip
tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.gz
tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.bz2
2 files changed, 30 insertions, 8 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index db2be84..cb8bb3e 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -185,17 +185,15 @@ Invalid(
  *	Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the
  *	provided buffer. Equivalent to Plan 9 runetochar().
  *
- *	Special handling of Surrogate pairs is handled as follows:
- *	When this function is called for ch being a high surrogate,
- *	the first byte of the 4-byte UTF-8 sequence is produced and
- *	the function returns 1. Calling the function again with a
- *	low surrogate, the remaining 3 bytes of the 4-byte UTF-8
- *	sequence is produced, and the function returns 3. The buffer
- *	is used to remember the high surrogate between the two calls.
+ *	Surrogate pairs are handled as follows: When ch is a high surrogate,
+ *	the first byte of the 4-byte UTF-8 sequence is stored in the buffer and
+ *	the function returns 1. If the function is called again with a low
+ *	surrogate and the same buffer, the remaining 3 bytes of the 4-byte
+ *	UTF-8 sequence are produced.
  *
  *	If no low surrogate follows the high surrogate (which is actually
  *	illegal), this can be handled reasonably by calling Tcl_UniCharToUtf
- *	again with ch = -1. This will produce a 3-byte UTF-8 sequence
+ *	again with ch = -1. This produces a 3-byte UTF-8 sequence
  *	representing the high surrogate.
  *
  * Results:
diff --git a/tests/encoding.test b/tests/encoding.test
index 10a37f8..ae6c78a 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -482,6 +482,30 @@ test encoding-16.7 {Utf32ToUtfProc} -body {
     list $val [format %x [scan $val %c]]
 } -result "乎 4e4e"
 
+test encoding-16.8 {
+    Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
+} -body {
+    # Round-trip every high/low surrogate pair through utf-16be; on the
+    # first failure return the offending pair (in hex), otherwise "done".
+    # Bounds are inclusive so that 0xDBFF and 0xDFFF are exercised too.
+    apply [list {} {
+	for {set i 0xD800} {$i <= 0xDBFF} {incr i} {
+	    for {set j 0xDC00} {$j <= 0xDFFF} {incr j} {
+		set string [binary format S2 [list $i $j]]
+		set status [catch {
+		    set decoded [encoding convertfrom utf-16be $string]
+		    set encoded [encoding convertto utf-16be $decoded]
+		}]
+		if {$status || ( $encoded ne $string )} {
+		    return [list [format %x $i] [format %x $j]]
+		}
+	    }
+	}
+	return done
+    } [namespace current]]
+} -result done
+
+
 test encoding-17.1 {UtfToUtf16Proc} -body {
     encoding convertto utf-16 "\U460DC"
 } -result "\xD8\xD8\xDC\xDC"
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-01 08:10:12 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-01 08:10:12 (GMT)
commit	3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e (patch)
tree	4952179bcfbada1be4941093c77d7a531dc7f135
parent	537c8e77ba967fbb6f2ef1d7b2134420a3117bad (diff)
download	tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.zip tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.gz tcl-3eaad4bbc95c9cb3eaaf79872646d4fa7f6d8c6e.tar.bz2