From a7ec180fc75e299b71f6d839da636eff3528a713 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 31 May 2019 23:19:07 +0000
Subject: TIP #547 implementation: New encodings: UTF-16, UCS-2

---
 generic/tclEncoding.c | 226 +++++++++++++++++++++++++++++++++++++++++---------
 tests/binary.test     |   2 +-
 tests/chanio.test     |   4 +-
 tests/encoding.test   |  23 +++--
 tests/io.test         |   4 +-
 tests/ioCmd.test      |   8 +-
 tests/source.test     |   4 +-
 7 files changed, 213 insertions(+), 58 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 7eb73e8..a88c1a7 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -234,12 +234,17 @@ static int		TableToUtfProc(ClientData clientData, const char *src,
 			    char *dst, int dstLen, int *srcReadPtr,
 			    int *dstWrotePtr, int *dstCharsPtr);
 static size_t		unilen(const char *src);
-static int		UniCharToUtfProc(ClientData clientData,
+static int		Utf16ToUtfProc(ClientData clientData,
 			    const char *src, int srcLen, int flags,
 			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
 			    int *srcReadPtr, int *dstWrotePtr,
 			    int *dstCharsPtr);
-static int		UtfToUniCharProc(ClientData clientData,
+static int		UtfToUtf16Proc(ClientData clientData,
+			    const char *src, int srcLen, int flags,
+			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
+			    int *srcReadPtr, int *dstWrotePtr,
+			    int *dstCharsPtr);
+static int		UtfToUcs2Proc(ClientData clientData,
 			    const char *src, int srcLen, int flags,
 			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
 			    int *srcReadPtr, int *dstWrotePtr,
@@ -564,11 +569,16 @@ TclInitEncodingSubsystem(void)
     TableEncodingData *dataPtr;
     unsigned size;
     unsigned short i;
+    union {
+        char c;
+        short s;
+    } isLe;
 
     if (encodingsInitialized) {
 	return;
     }
 
+    isLe.s = 1;
     Tcl_MutexLock(&encodingMutex);
     Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
     Tcl_MutexUnlock(&encodingMutex);
@@ -595,13 +605,38 @@ TclInitEncodingSubsystem(void)
     type.clientData	= NULL;
     Tcl_CreateEncoding(&type);
 
-    type.encodingName   = "unicode";
-    type.toUtfProc	= UniCharToUtfProc;
-    type.fromUtfProc    = UtfToUniCharProc;
+    type.toUtfProc	= Utf16ToUtfProc;
+    type.fromUtfProc    = UtfToUcs2Proc;
     type.freeProc	= NULL;
     type.nullSize	= 2;
-    type.clientData	= NULL;
+    type.encodingName   = "ucs-2le";
+    type.clientData	= INT2PTR(1);
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "ucs-2be";
+    type.clientData	= INT2PTR(0);
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "ucs-2";
+    type.clientData	= INT2PTR(isLe.c);
+    Tcl_CreateEncoding(&type);
+
+    type.toUtfProc	= Utf16ToUtfProc;
+    type.fromUtfProc    = UtfToUtf16Proc;
+    type.freeProc	= NULL;
+    type.nullSize	= 2;
+    type.encodingName   = "utf-16le";
+    type.clientData	= INT2PTR(1);;
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "utf-16be";
+    type.clientData	= INT2PTR(0);
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "utf-16";
+    type.clientData	= INT2PTR(isLe.c);;
+    Tcl_CreateEncoding(&type);
+
+#ifndef TCL_NO_DEPRECATED
+    type.encodingName   = "unicode";
     Tcl_CreateEncoding(&type);
+#endif
 
     /*
      * Need the iso8859-1 encoding in order to process binary data, so force
@@ -1279,7 +1314,7 @@ Tcl_ExternalToUtf(
 	if (*dstCharsPtr <= maxChars) {
 	    break;
 	}
-	dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX;
+	dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
 	flags = savedFlags;
 	*statePtr = savedState;
     } while (1);
@@ -2401,9 +2436,9 @@ UtfToUtfProc(
 /*
  *-------------------------------------------------------------------------
  *
- * UniCharToUtfProc --
+ * Utf16ToUtfProc --
  *
- *	Convert from Unicode to UTF-8.
+ *	Convert from UTF-16 to UTF-8.
  *
  * Results:
  *	Returns TCL_OK if conversion was successful.
@@ -2415,8 +2450,8 @@ UtfToUtfProc(
  */
 
 static int
-UniCharToUtfProc(
-    ClientData clientData,	/* Not used. */
+Utf16ToUtfProc(
+    ClientData clientData,	/* != NULL means LE, == NULL means BE */
     const char *src,		/* Source string in Unicode. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2468,12 +2503,15 @@ UniCharToUtfProc(
 	    break;
 	}
 
+	if (clientData) {
+	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
+	} else {
+	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
+	}
 	/*
 	 * Special case for 1-byte utf chars for speed. Make sure we work with
 	 * unsigned short-size data.
 	 */
-
-	ch = *(unsigned short *)src;
 	if (ch && ch < 0x80) {
 	    *dst++ = (ch & 0xFF);
 	} else {
@@ -2491,9 +2529,9 @@ UniCharToUtfProc(
 /*
  *-------------------------------------------------------------------------
  *
- * UtfToUniCharProc --
+ * UtfToUtf16Proc --
  *
- *	Convert from UTF-8 to Unicode.
+ *	Convert from UTF-8 to UTF-16.
  *
  * Results:
  *	Returns TCL_OK if conversion was successful.
@@ -2505,9 +2543,8 @@ UniCharToUtfProc(
  */
 
 static int
-UtfToUniCharProc(
-    ClientData clientData,	/* TableEncodingData that specifies
-				 * encoding. */
+UtfToUtf16Proc(
+    ClientData clientData,	/* != NULL means LE, == NULL means BE */
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2571,44 +2608,151 @@ UtfToUniCharProc(
 	 * casting dst to a Tcl_UniChar. [Bug 1122671]
 	 */
 
-#ifdef WORDS_BIGENDIAN
+	if (clientData) {
 #if TCL_UTF_MAX > 4
-	if (*chPtr <= 0xFFFF) {
-	    *dst++ = (*chPtr >> 8);
-	    *dst++ = (*chPtr & 0xFF);
-	} else {
-	    *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
-	    *dst++ = (*chPtr & 0xFF);
-	    *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
-	    *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-	}
-#else
-	*dst++ = (*chPtr >> 8);
-	*dst++ = (*chPtr & 0xFF);
-#endif
+	    if (*chPtr <= 0xFFFF) {
+		*dst++ = (*chPtr & 0xFF);
+		*dst++ = (*chPtr >> 8);
+	    } else {
+		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (*chPtr & 0xFF);
+		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+	    }
 #else
-#if TCL_UTF_MAX > 4
-	if (*chPtr <= 0xFFFF) {
 	    *dst++ = (*chPtr & 0xFF);
 	    *dst++ = (*chPtr >> 8);
+#endif
 	} else {
-	    *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-	    *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+#if TCL_UTF_MAX > 4
+	    if (*chPtr <= 0xFFFF) {
+		*dst++ = (*chPtr >> 8);
+		*dst++ = (*chPtr & 0xFF);
+	    } else {
+		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+		*dst++ = (*chPtr & 0xFF);
+		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+	    }
+#else
+	    *dst++ = (*chPtr >> 8);
 	    *dst++ = (*chPtr & 0xFF);
-	    *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+#endif
 	}
-#else
-	*dst++ = (*chPtr & 0xFF);
-	*dst++ = (*chPtr >> 8);
+    }
+    *srcReadPtr = src - srcStart;
+    *dstWrotePtr = dst - dstStart;
+    *dstCharsPtr = numChars;
+    return result;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfToUcs2Proc --
+ *
+ *	Convert from UTF-8 to UCS-2, little- or big-endian per clientData.
+ *
+ * Results:
+ *	TCL_OK on success; TCL_CONVERT_MULTIBYTE or TCL_CONVERT_NOSPACE if not.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+UtfToUcs2Proc(
+    ClientData clientData,	/* != NULL means LE, == NULL means BE */
+    const char *src,		/* Source string in UTF-8. */
+    int srcLen,			/* Source string length in bytes. */
+    int flags,			/* Conversion control flags. */
+    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
+				 * information used during a piecewise
+				 * conversion. Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
+    char *dst,			/* Output buffer in which converted string is
+				 * stored. */
+    int dstLen,			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr,		/* Filled with the number of bytes from the
+				 * source string that were converted. This may
+				 * be less than the original source length if
+				 * there was a problem converting some source
+				 * characters. */
+    int *dstWrotePtr,		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr)		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+    int result, numChars;
+#if TCL_UTF_MAX <= 4
+    int len;
 #endif
+    Tcl_UniChar ch = 0;
+
+    srcStart = src;
+    srcEnd = src + srcLen;
+    srcClose = srcEnd;
+    if ((flags & TCL_ENCODING_END) == 0) {
+	srcClose -= TCL_UTF_MAX;	/* tail bytes may be a cut-off character */
+    }
+
+    dstStart = dst;
+    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);	/* last spot where a full Tcl_UniChar fits */
+
+    result = TCL_OK;
+    for (numChars = 0; src < srcEnd; numChars++) {
+	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+	    /*
+	     * More input is expected and the trailing UTF-8 character is
+	     * incomplete: stop here so the caller can supply the rest.
+	     */
+
+	    result = TCL_CONVERT_MULTIBYTE;
+	    break;
+	}
+	if (dst > dstEnd) {
+	    result = TCL_CONVERT_NOSPACE;
+	    break;
+	}
+#if TCL_UTF_MAX <= 4
+	src += (len = TclUtfToUniChar(src, &ch));
+	if ((ch >= 0xD800) && (len < 3)) {
+	    src += TclUtfToUniChar(src, &ch);	/* presumably skips the rest of a 4-byte sequence -- TODO confirm */
+	    ch = 0xFFFD;	/* U+FFFD REPLACEMENT CHARACTER */
+	}
+#else
+	src += TclUtfToUniChar(src, &ch);
+	if (ch > 0xFFFF) {
+	    ch = 0xFFFD;	/* does not fit in 16 bits: emit U+FFFD instead */
+	}
 #endif
+
+	/*
+	 * Emit the code unit one byte at a time, in the byte order selected by
+	 * clientData; casting dst to a Tcl_UniChar could misalign. [Bug 1122671]
+	 */
+
+	if (clientData) {
+	    *dst++ = (ch & 0xFF);
+	    *dst++ = (ch >> 8);
+	} else {
+	    *dst++ = (ch >> 8);
+	    *dst++ = (ch & 0xFF);
+	}
     }
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
     return result;
 }
-
+
 /*
  *-------------------------------------------------------------------------
  *
diff --git a/tests/binary.test b/tests/binary.test
index aede659..973240f 100644
--- a/tests/binary.test
+++ b/tests/binary.test
@@ -2912,7 +2912,7 @@ test binary-77.2 {string cat ops on all bytearrays} {
 } [binary format H* abcd]
 
 test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body {
-    # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX <= 4):
+    # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX == 3):
     binary encode hex \U0001f415
     binary scan \U0001f415 a* v; set v
     set str {}
diff --git a/tests/chanio.test b/tests/chanio.test
index 9dc9e7c..1439fe4 100644
--- a/tests/chanio.test
+++ b/tests/chanio.test
@@ -888,7 +888,7 @@ test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} -setup
     # Tcl_ExternalToUtf()
     set f [openpipe w+ $path(cat)]
     chan configure $f -translation {auto lf} -buffering none
-    chan configure $f -encoding unicode
+    chan configure $f -encoding utf-16
     chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
     chan configure $f -buffersize 16
     chan gets $f
@@ -1129,7 +1129,7 @@ test chan-io-8.2 {PeekAhead: only go to device if no more cached data} -setup {
     chan event $f read [namespace code {
 	lappend x [chan gets $f line] $line [testchannel inputbuffered $f]
     }]
-    chan configure $f -encoding unicode -buffersize 16 -blocking 0
+    chan configure $f -encoding utf-16 -buffersize 16 -blocking 0
     vwait [namespace which -variable x]
     chan configure $f -translation auto -encoding ascii -blocking 1
     # here
diff --git a/tests/encoding.test b/tests/encoding.test
index 4736928..da34f03 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -322,18 +322,29 @@ test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
     set z
 } c080
 
-test encoding-16.1 {UnicodeToUtfProc} -body {
-    set val [encoding convertfrom unicode NN]
+test encoding-16.1 {Utf16ToUtfProc} -body {
+    set val [encoding convertfrom utf-16 NN]
     list $val [format %x [scan $val %c]]
 } -result "\u4e4e 4e4e"
-test encoding-16.2 {UnicodeToUtfProc} -body {
-    set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"]
+test encoding-16.2 {Utf16ToUtfProc} -body {
+    set val [encoding convertfrom utf-16 "\xd8\xd8\xdc\xdc"]
+    list $val [format %x [scan $val %c]]
+} -result "\U460dc 460dc"
+test encoding-16.3 {Ucs2ToUtfProc} -body {
+    set val [encoding convertfrom ucs-2 NN]
+    list $val [format %x [scan $val %c]]
+} -result "\u4e4e 4e4e"
+test encoding-16.4 {Ucs2ToUtfProc} -body {
+    set val [encoding convertfrom ucs-2 "\xd8\xd8\xdc\xdc"]
     list $val [format %x [scan $val %c]]
 } -result "\U460dc 460dc"
 
-test encoding-17.1 {UtfToUnicodeProc} -body {
-    encoding convertto unicode "\U460dc"
+test encoding-17.1 {UtfToUtf16Proc} -body {
+    encoding convertto utf-16 "\U460dc"
 } -result "\xd8\xd8\xdc\xdc"
+test encoding-17.2 {UtfToUcs2Proc} -body {
+    encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460dc"]
+} -result "\ufffd"
 
 test encoding-18.1 {TableToUtfProc} {
 } {}
diff --git a/tests/io.test b/tests/io.test
index 6470282..39deab6 100644
--- a/tests/io.test
+++ b/tests/io.test
@@ -918,7 +918,7 @@ test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio test
 
     set f [open "|[list [interpreter] $path(cat)]" w+]
     fconfigure $f -translation {auto lf} -buffering none
-    fconfigure $f -encoding unicode
+    fconfigure $f -encoding utf-16
     puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
     fconfigure $f -buffersize 16
     gets $f
@@ -1162,7 +1162,7 @@ test io-8.2 {PeekAhead: only go to device if no more cached data} {stdio testcha
 	variable x
 	lappend x [gets $f line] $line [testchannel inputbuffered $f]
     }
-    fconfigure $f -encoding unicode -buffersize 16 -blocking 0
+    fconfigure $f -encoding utf-16 -buffersize 16 -blocking 0
     vwait [namespace which -variable x]
     fconfigure $f -translation auto -encoding ascii -blocking 1
     # here
diff --git a/tests/ioCmd.test b/tests/ioCmd.test
index a967139..87ad4af 100644
--- a/tests/ioCmd.test
+++ b/tests/ioCmd.test
@@ -241,23 +241,23 @@ test iocmd-8.7 {fconfigure command} -setup {
     file delete $path(test1)
 } -body {
     set f1 [open $path(test1) w]
-    fconfigure $f1 -translation lf -eofchar {} -encoding unicode
+    fconfigure $f1 -translation lf -eofchar {} -encoding utf-16
     fconfigure $f1
 } -cleanup {
     catch {close $f1}
-} -result {-blocking 1 -buffering full -buffersize 4096 -encoding unicode -eofchar {} -translation lf}
+} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -translation lf}
 test iocmd-8.8 {fconfigure command} -setup {
     file delete $path(test1)
     set x {}
 } -body {
     set f1 [open $path(test1) w]
     fconfigure $f1 -translation lf -buffering line -buffersize 3030 \
-		-eofchar {} -encoding unicode
+		-eofchar {} -encoding utf-16
     lappend x [fconfigure $f1 -buffering]
     lappend x [fconfigure $f1]
 } -cleanup {
     catch {close $f1}
-} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding unicode -eofchar {} -translation lf}}
+} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -translation lf}}
 test iocmd-8.9 {fconfigure command} -setup {
     file delete $path(test1)
 } -body {
diff --git a/tests/source.test b/tests/source.test
index 8b146d3..c6cccd6 100644
--- a/tests/source.test
+++ b/tests/source.test
@@ -240,12 +240,12 @@ test source-7.2 {source -encoding test} -setup {
     set sourcefile [makeFile {} source.file]
     file delete $sourcefile
     set f [open $sourcefile w]
-    fconfigure $f -encoding unicode
+    fconfigure $f -encoding utf-16
     puts $f "set symbol(square-root) \u221A; set x correct"
     close $f
 } -body {
     set x unset
-    source -encoding unicode $sourcefile
+    source -encoding utf-16 $sourcefile
     set x
 } -cleanup {
     removeFile source.file
-- 
cgit v0.12