From be9f231fdaea85177d206ea98320c888b73f00d8 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 13 Oct 2021 13:33:43 +0000
Subject: First shot at [cef426ff2c]: Encoding UTF-32 missing

---
 generic/tclEncoding.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++---
 tests/encoding.test   |  24 ++++-
 2 files changed, 258 insertions(+), 15 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 61a931d..4166e45 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -45,7 +45,9 @@ typedef struct {
 				 * If nullSize is 1, this is strlen; if
 				 * nullSize is 2, this is a function that
 				 * returns the number of bytes in a 0x0000
-				 * terminated string. */
+				 * terminated string; if nullSize is 4, this
+				 * is a function that returns the number of bytes
+				 * in a 0x00000000 terminated string. */
     size_t refCount;		/* Number of uses of this structure. */
     Tcl_HashEntry *hPtr;	/* Hash table entry that owns this encoding. */
 } Encoding;
@@ -216,7 +218,10 @@ static Tcl_Channel		OpenEncodingFileChannel(Tcl_Interp *interp,
 static Tcl_EncodingFreeProc	TableFreeProc;
 static Tcl_EncodingConvertProc	TableFromUtfProc;
 static Tcl_EncodingConvertProc	TableToUtfProc;
+static size_t			char16len(const char *src);
 static size_t			unilen(const char *src);
+static Tcl_EncodingConvertProc	Utf32ToUtfProc;
+static Tcl_EncodingConvertProc	UtfToUtf32Proc;
 static Tcl_EncodingConvertProc	Utf16ToUtfProc;
 static Tcl_EncodingConvertProc	UtfToUtf16Proc;
 static Tcl_EncodingConvertProc	UtfToUcs2Proc;
@@ -577,6 +582,20 @@ TclInitEncodingSubsystem(void)
     type.clientData	= INT2PTR(isLe.c);
     Tcl_CreateEncoding(&type);
 
+    type.toUtfProc	= Utf32ToUtfProc;
+    type.fromUtfProc    = UtfToUtf32Proc;
+    type.freeProc	= NULL;
+    type.nullSize	= 4;
+    type.encodingName   = "utf-32le";
+    type.clientData	= INT2PTR(TCL_ENCODING_LE);
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "utf-32be";
+    type.clientData	= INT2PTR(0);
+    Tcl_CreateEncoding(&type);
+    type.encodingName   = "utf-32";
+    type.clientData	= INT2PTR(isLe.c);
+    Tcl_CreateEncoding(&type);
+
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUtf16Proc;
     type.freeProc	= NULL;
@@ -1057,10 +1076,12 @@ Tcl_CreateEncoding(
     encodingPtr->freeProc	= typePtr->freeProc;
     encodingPtr->nullSize	= typePtr->nullSize;
     encodingPtr->clientData	= typePtr->clientData;
-    if (typePtr->nullSize == 1) {
-	encodingPtr->lengthProc = (LengthProc *) strlen;
-    } else {
+    if (typePtr->nullSize == 2) {
+	encodingPtr->lengthProc = (LengthProc *) char16len;
+    } else if (typePtr->nullSize == 4) {
 	encodingPtr->lengthProc = (LengthProc *) unilen;
+    } else {
+	encodingPtr->lengthProc = (LengthProc *) strlen;
     }
     encodingPtr->refCount	= 1;
     encodingPtr->hPtr		= NULL;
@@ -1343,10 +1364,10 @@ Tcl_UtfToExternalDString(
 
 	src += srcRead;
 	if (result != TCL_CONVERT_NOSPACE) {
-	    if (encodingPtr->nullSize == 2) {
-		Tcl_DStringSetLength(dstPtr, soFar + 1);
+		int i = soFar + encodingPtr->nullSize - 1;
+	    while (i >= soFar) {
+		Tcl_DStringSetLength(dstPtr, i--);
 	    }
-	    Tcl_DStringSetLength(dstPtr, soFar);
 	    return Tcl_DStringValue(dstPtr);
 	}
 
@@ -1441,10 +1462,7 @@ Tcl_UtfToExternal(
     result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
 	    flags, statePtr, dst, dstLen, srcReadPtr,
 	    dstWrotePtr, dstCharsPtr);
-    if (encodingPtr->nullSize == 2) {
-	dst[*dstWrotePtr + 1] = '\0';
-    }
-    dst[*dstWrotePtr] = '\0';
+    memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize);
 
     return result;
 }
@@ -2335,6 +2353,198 @@ UtfToUtfProc(
     *dstCharsPtr = numChars;
     return result;
 }
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * Utf32ToUtfProc --
+ *
+ *	Convert from UTF-32 to UTF-8.
+ *
+ * Results:
+ *	Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+Utf32ToUtfProc(
+    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
+    const char *src,		/* Source string in UTF-32. */
+    int srcLen,			/* Source string length in bytes. */
+    int flags,			/* Conversion control flags. */
+    TCL_UNUSED(Tcl_EncodingState *),
+    char *dst,			/* Output buffer in which converted string is
+				 * stored. */
+    int dstLen,			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr,		/* Filled with the number of bytes from the
+				 * source string that were converted. This may
+				 * be less than the original source length if
+				 * there was a problem converting some source
+				 * characters. */
+    int *dstWrotePtr,		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr)		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    const char *srcStart, *srcEnd;
+    const char *dstEnd, *dstStart;
+    int result, numChars, charLimit = INT_MAX;
+    unsigned short ch;
+
+    flags |= PTR2INT(clientData);
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
+    result = TCL_OK;
+
+    /*
+     * Check alignment with utf-32 (4 == sizeof(UTF-32))
+     */
+
+    if ((srcLen % 4) != 0) {
+	result = TCL_CONVERT_MULTIBYTE;
+	srcLen &= -4;		/* Ignore the trailing partial code unit. */
+    }
+
+    srcStart = src;
+    srcEnd = src + srcLen;
+
+    dstStart = dst;
+    dstEnd = dst + dstLen - TCL_UTF_MAX;
+
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+	if (dst > dstEnd) {
+	    result = TCL_CONVERT_NOSPACE;
+	    break;
+	}
+
+	if (flags & TCL_ENCODING_LE) {	/* Top byte is shifted as unsigned. */
+	    ch = (unsigned int)(src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
+	} else {
+	    ch = (unsigned int)(src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
+	}
+
+	/*
+	 * Special case for 1-byte utf chars for speed. Make sure we work with
+	 * unsigned short-size data.
+	 */
+
+	if (ch && ch < 0x80) {
+	    *dst++ = (ch & 0xFF);
+	} else {
+	    dst += Tcl_UniCharToUtf(ch, dst);
+	}
+	src += 4;		/* One UTF-32 code unit is always 4 bytes. */
+    }
+
+    *srcReadPtr = src - srcStart;
+    *dstWrotePtr = dst - dstStart;
+    *dstCharsPtr = numChars;
+    return result;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfToUtf32Proc --
+ *
+ *	Convert from UTF-8 to UTF-32.
+ *
+ * Results:
+ *	Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+UtfToUtf32Proc(
+    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
+    const char *src,		/* Source string in UTF-8. */
+    int srcLen,			/* Source string length in bytes. */
+    int flags,			/* Conversion control flags. */
+    TCL_UNUSED(Tcl_EncodingState *),
+    char *dst,			/* Output buffer in which converted string is
+				 * stored. */
+    int dstLen,			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr,		/* Filled with the number of bytes from the
+				 * source string that were converted. This may
+				 * be less than the original source length if
+				 * there was a problem converting some source
+				 * characters. */
+    int *dstWrotePtr,		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr)		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+    int result, numChars;
+    int ch, len;
+
+    srcStart = src;
+    srcEnd = src + srcLen;
+    srcClose = srcEnd;
+    if ((flags & TCL_ENCODING_END) == 0) {
+	srcClose -= TCL_UTF_MAX;	/* Past here, check for a cut-off char. */
+    }
+
+    dstStart = dst;
+    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
+    flags |= PTR2INT(clientData);
+
+    result = TCL_OK;
+    for (numChars = 0; src < srcEnd; numChars++) {
+	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+	    /*
+	     * If there is more string to follow, this will ensure that the
+	     * last UTF-8 character in the source buffer hasn't been cut off.
+	     */
+
+	    result = TCL_CONVERT_MULTIBYTE;
+	    break;
+	}
+	if ((dstEnd + sizeof(Tcl_UniChar)) - dst < 4) {	/* 4 bytes are written below. */
+	    result = TCL_CONVERT_NOSPACE;
+	    break;
+	}
+	len = TclUtfToUCS4(src, &ch);
+	if (!Tcl_UniCharIsUnicode(ch)) {
+	    if (flags & TCL_ENCODING_STOPONERROR) {
+		result = TCL_CONVERT_UNKNOWN;
+		break;
+	    }
+	    ch = 0xFFFD;	/* U+FFFD REPLACEMENT CHARACTER */
+	}
+	src += len;
+	if (flags & TCL_ENCODING_LE) {
+	    *dst++ = (ch & 0xFF);
+	    *dst++ = ((ch >> 8) & 0xff);
+	    *dst++ = ((ch >> 16) & 0xff);
+	    *dst++ = ((ch >> 24) & 0xff);
+	} else {
+	    *dst++ = ((ch >> 24) & 0xff);
+	    *dst++ = ((ch >> 16) & 0xff);
+	    *dst++ = ((ch >> 8) & 0xff);
+	    *dst++ = (ch & 0xFF);
+	}
+    }
+    *srcReadPtr = src - srcStart;
+    *dstWrotePtr = dst - dstStart;
+    *dstCharsPtr = numChars;
+    return result;
+}
 
 /*
  *-------------------------------------------------------------------------
@@ -3628,7 +3838,7 @@ GetTableEncoding(
 /*
  *---------------------------------------------------------------------------
  *
- * unilen --
+ * unilen/char16len --
  *
  *	A helper function for the Tcl_ExternalToUtf functions. This function
  *	is similar to strlen for double-byte characters: it returns the number
@@ -3644,7 +3854,7 @@ GetTableEncoding(
  */
 
 static size_t
-unilen(
+char16len(
     const char *src)
 {
     unsigned short *p;
@@ -3655,6 +3865,19 @@ unilen(
     }
     return (char *) p - src;
 }
+
+static size_t
+unilen(
+    const char *src)
+{
+    unsigned int *p;
+
+    p = (unsigned int *) src;
+    while (*p != 0x0000) {
+	p++;
+    }
+    return (char *) p - src;
+}
 
 /*
  *-------------------------------------------------------------------------
diff --git a/tests/encoding.test b/tests/encoding.test
index 25d0827..c6f4e02 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -287,6 +287,12 @@ test encoding-11.8 {encoding: extended Unicode UTF-16} {
 test encoding-11.9 {encoding: extended Unicode UTF-16} {
     viewable [encoding convertto utf-16be 😹]
 } {Ø=Þ9 (\u00D8=\u00DE9)}
+test encoding-11.10 {encoding: extended Unicode UTF-32} {
+    viewable [encoding convertto utf-32le 😹]
+} "9\xF6\x01\x00 (9\\u00F6\\u0001\\u0000)"
+test encoding-11.11 {encoding: extended Unicode UTF-32} {
+    viewable [encoding convertto utf-32be 😹]
+} "\x00\x01\xF69 (\\u0000\\u0001\\u00F69)"
 # OpenEncodingFile is fully tested by the rest of the tests in this file.
 
 test encoding-12.1 {LoadTableEncoding: normal encoding} {
@@ -461,10 +467,18 @@ test encoding-16.4 {Ucs2ToUtfProc} -body {
     set val [encoding convertfrom ucs-2 NN]
     list $val [format %x [scan $val %c]]
 } -result "乎 4e4e"
-test encoding-16.4 {Ucs2ToUtfProc} -body {
+test encoding-16.5 {Ucs2ToUtfProc} -body {
     set val [encoding convertfrom ucs-2 "\xD8\xD8\xDC\xDC"]
     list $val [format %x [scan $val %c]]
 } -result "\U460DC 460dc"
+test encoding-16.6 {Utf32ToUtfProc} -body {
+    set val [encoding convertfrom utf-32le NN\0\0]
+    list $val [format %x [scan $val %c]]
+} -result "乎 4e4e"
+test encoding-16.7 {Utf32ToUtfProc} -body {
+    set val [encoding convertfrom utf-32be \0\0NN]
+    list $val [format %x [scan $val %c]]
+} -result "乎 4e4e"
 
 test encoding-17.1 {UtfToUtf16Proc} -body {
     encoding convertto utf-16 "\U460DC"
@@ -478,6 +492,12 @@ test encoding-17.3 {UtfToUtf16Proc} -body {
 test encoding-17.4 {UtfToUtf16Proc} -body {
     encoding convertto utf-16le "\uD8D8"
 } -result "\xFD\xFF"
+test encoding-17.5 {UtfToUtf32Proc} -body {
+    encoding convertto utf-32le "\U460DC"
+} -result "\xDC\x60\x04\x00"
+test encoding-17.6 {UtfToUtf32Proc} -body {
+    encoding convertto utf-32be "\U460DC"
+} -result "\x00\x04\x60\xDC"
 
 test encoding-18.1 {TableToUtfProc} {
 } {}
@@ -777,7 +797,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 89 : 88}]
+} -result [expr {[info exists ::tcl_precision] ? 92 : 91}]
 
 runtests
 
-- 
cgit v0.12


From 82df0e7cc9a71bbd7eb06fe4bd1ee9e979b40a13 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 13 Oct 2021 13:51:26 +0000
Subject: Oops

---
 generic/tclEncoding.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 4166e45..5549874 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2396,7 +2396,7 @@ Utf32ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
-    unsigned short ch;
+    int ch;
 
     flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2436,7 +2436,7 @@ Utf32ToUtfProc(
 	 * unsigned short-size data.
 	 */
 
-	if (ch && ch < 0x80) {
+	if ((ch > 0) && (ch < 0x80)) {
 	    *dst++ = (ch & 0xFF);
 	} else {
 	    dst += Tcl_UniCharToUtf(ch, dst);
-- 
cgit v0.12


From 7e82249b7fbe90827c600cfd7fd3977ba18a54e2 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 14 Oct 2021 14:32:39 +0000
Subject: Code/Comment cleanup

---
 generic/tclEncoding.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 5549874..fad9faa 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -37,7 +37,7 @@ typedef struct {
 				 * end-of-string in this encoding. This number
 				 * is used to determine the source string
 				 * length when the srcLen argument is
-				 * negative. This number can be 1 or 2. */
+				 * negative. This number can be 1, 2, or 4. */
     ClientData clientData;	/* Arbitrary value associated with encoding
 				 * type. Passed to conversion functions. */
     LengthProc *lengthProc;	/* Function to compute length of
@@ -46,8 +46,8 @@ typedef struct {
 				 * nullSize is 2, this is a function that
 				 * returns the number of bytes in a 0x0000
 				 * terminated string; if nullSize is 4, this
-				 * is a function that returns the number of bytes
-				 * in a 0x00000000 terminated string. */
+				 * is a function that returns the number of
+				 * bytes in a 0x00000000 terminated string. */
     size_t refCount;		/* Number of uses of this structure. */
     Tcl_HashEntry *hPtr;	/* Hash table entry that owns this encoding. */
 } Encoding;
@@ -218,8 +218,8 @@ static Tcl_Channel		OpenEncodingFileChannel(Tcl_Interp *interp,
 static Tcl_EncodingFreeProc	TableFreeProc;
 static Tcl_EncodingConvertProc	TableFromUtfProc;
 static Tcl_EncodingConvertProc	TableToUtfProc;
-static size_t			char16len(const char *src);
-static size_t			unilen(const char *src);
+static size_t		unilen(const char *src);
+static size_t		unilen4(const char *src);
 static Tcl_EncodingConvertProc	Utf32ToUtfProc;
 static Tcl_EncodingConvertProc	UtfToUtf32Proc;
 static Tcl_EncodingConvertProc	Utf16ToUtfProc;
@@ -1077,9 +1077,9 @@ Tcl_CreateEncoding(
     encodingPtr->nullSize	= typePtr->nullSize;
     encodingPtr->clientData	= typePtr->clientData;
     if (typePtr->nullSize == 2) {
-	encodingPtr->lengthProc = (LengthProc *) char16len;
-    } else if (typePtr->nullSize == 4) {
 	encodingPtr->lengthProc = (LengthProc *) unilen;
+    } else if (typePtr->nullSize == 4) {
+	encodingPtr->lengthProc = (LengthProc *) unilen4;
     } else {
 	encodingPtr->lengthProc = (LengthProc *) strlen;
     }
@@ -1364,7 +1364,7 @@ Tcl_UtfToExternalDString(
 
 	src += srcRead;
 	if (result != TCL_CONVERT_NOSPACE) {
-		int i = soFar + encodingPtr->nullSize - 1;
+	    int i = soFar + encodingPtr->nullSize - 1;
 	    while (i >= soFar) {
 		Tcl_DStringSetLength(dstPtr, i--);
 	    }
@@ -2501,7 +2501,7 @@ UtfToUtf32Proc(
     }
 
     dstStart = dst;
-    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
+    dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
     flags |= PTR2INT(clientData);
 
     result = TCL_OK;
@@ -2540,6 +2540,7 @@ UtfToUtf32Proc(
 	    *dst++ = (ch & 0xFF);
 	}
     }
+
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
@@ -2861,7 +2862,7 @@ UtfToUcs2Proc(
     *dstCharsPtr = numChars;
     return result;
 }
-
+
 /*
  *-------------------------------------------------------------------------
  *
@@ -3303,7 +3304,7 @@ TableFreeProc(
     ClientData clientData)	/* TableEncodingData that specifies
 				 * encoding. */
 {
-    TableEncodingData *dataPtr = (TableEncodingData *) clientData;
+    TableEncodingData *dataPtr = (TableEncodingData *)clientData;
 
     /*
      * Make sure we aren't freeing twice on shutdown. [Bug 219314]
@@ -3361,7 +3362,7 @@ EscapeToUtfProc(
 				 * correspond to the bytes stored in the
 				 * output buffer. */
 {
-    EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
+    EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
     const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
     const unsigned short *const *tableToUnicode;
     const Encoding *encodingPtr;
@@ -3838,7 +3839,7 @@ GetTableEncoding(
 /*
  *---------------------------------------------------------------------------
  *
- * unilen/char16len --
+ * unilen, unilen4 --
  *
  *	A helper function for the Tcl_ExternalToUtf functions. This function
  *	is similar to strlen for double-byte characters: it returns the number
@@ -3854,7 +3855,7 @@ GetTableEncoding(
  */
 
 static size_t
-char16len(
+unilen(
     const char *src)
 {
     unsigned short *p;
@@ -3867,13 +3868,13 @@ char16len(
 }
 
 static size_t
-unilen(
+unilen4(
     const char *src)
 {
     unsigned int *p;
 
     p = (unsigned int *) src;
-    while (*p != 0x0000) {
+    while (*p != 0x00000000) {
 	p++;
     }
     return (char *) p - src;
@@ -3909,7 +3910,7 @@ InitializeEncodingSearchPath(
     Tcl_Encoding *encodingPtr)
 {
     const char *bytes;
-    int i, numDirs;
+    int i, numDirs, numBytes;
     Tcl_Obj *libPathObj, *encodingObj, *searchPathObj;
 
     TclNewLiteralStringObj(encodingObj, "encoding");
@@ -3939,11 +3940,11 @@ InitializeEncodingSearchPath(
     if (*encodingPtr) {
 	((Encoding *)(*encodingPtr))->refCount++;
     }
-    bytes = TclGetString(searchPathObj);
+    bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes);
 
-    *lengthPtr = searchPathObj->length;
-    *valuePtr = (char *)ckalloc(*lengthPtr + 1);
-    memcpy(*valuePtr, bytes, *lengthPtr + 1);
+    *lengthPtr = numBytes;
+    *valuePtr = (char *)ckalloc(numBytes + 1);
+    memcpy(*valuePtr, bytes, numBytes + 1);
     Tcl_DecrRefCount(searchPathObj);
 }
 
-- 
cgit v0.12