1 files changed, 98 insertions, 200 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 3ef4e58..8a29722 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -220,14 +220,7 @@ static size_t			unilen(const char *src);
 static Tcl_EncodingConvertProc	Utf16ToUtfProc;
 static Tcl_EncodingConvertProc	UtfToUtf16Proc;
 static Tcl_EncodingConvertProc	UtfToUcs2Proc;
-static int              	UtfToUtfProc(ClientData clientData,
-	                            const char *src, int srcLen, int flags,
-       		                    Tcl_EncodingState *statePtr, char *dst,
-				    int dstLen, int *srcReadPtr,
-				    int *dstWrotePtr, int *dstCharsPtr,
-				    int pureNullMode);
-static Tcl_EncodingConvertProc	UtfIntToUtfExtProc;
-static Tcl_EncodingConvertProc	UtfExtToUtfIntProc;
+static Tcl_EncodingConvertProc	UtfToUtfProc;
 static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
 static Tcl_EncodingConvertProc	Iso88591ToUtfProc;
 
@@ -517,6 +510,10 @@ FillEncodingFileMap(void)
  *---------------------------------------------------------------------------
  */
 
+/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
+#define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
+#define TCL_ENCODING_LE		0x80	/* Little-endian encoding, for ucs-2/utf-16 only */
+
 void
 TclInitEncodingSubsystem(void)
 {
@@ -525,7 +522,7 @@ TclInitEncodingSubsystem(void)
     unsigned size;
     unsigned short i;
     union {
-        char c;
+        unsigned char c;
         short s;
     } isLe;
 
@@ -533,7 +530,7 @@ TclInitEncodingSubsystem(void)
 	return;
     }
 
-    isLe.s = 1;
+    isLe.s = TCL_ENCODING_LE;
     Tcl_MutexLock(&encodingMutex);
     Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
     Tcl_MutexUnlock(&encodingMutex);
@@ -553,8 +550,8 @@ TclInitEncodingSubsystem(void)
     tclIdentityEncoding = Tcl_CreateEncoding(&type);
 
     type.encodingName	= "utf-8";
-    type.toUtfProc	= UtfExtToUtfIntProc;
-    type.fromUtfProc	= UtfIntToUtfExtProc;
+    type.toUtfProc	= UtfToUtfProc;
+    type.fromUtfProc	= UtfToUtfProc;
     type.freeProc	= NULL;
     type.nullSize	= 1;
     type.clientData	= NULL;
@@ -565,7 +562,7 @@ TclInitEncodingSubsystem(void)
     type.freeProc	= NULL;
     type.nullSize	= 2;
     type.encodingName   = "ucs-2le";
-    type.clientData	= INT2PTR(1);
+    type.clientData	= INT2PTR(TCL_ENCODING_LE);
     Tcl_CreateEncoding(&type);
     type.encodingName   = "ucs-2be";
     type.clientData	= INT2PTR(0);
@@ -579,7 +576,7 @@ TclInitEncodingSubsystem(void)
     type.freeProc	= NULL;
     type.nullSize	= 2;
     type.encodingName   = "utf-16le";
-    type.clientData	= INT2PTR(1);
+    type.clientData	= INT2PTR(TCL_ENCODING_LE);
     Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16be";
     type.clientData	= INT2PTR(0);
@@ -1077,7 +1074,7 @@ Tcl_ExternalToUtfDString(
 	srcLen = encodingPtr->lengthProc(src);
     }
 
-    flags = TCL_ENCODING_START | TCL_ENCODING_END;
+    flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
 
     while (1) {
 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1186,25 +1183,24 @@ Tcl_ExternalToUtf(
     if (!noTerminate) {
 	/*
 	 * If there are any null characters in the middle of the buffer,
-	 * they will converted to the UTF-8 null character (\xC080). To get
+	 * they will converted to the UTF-8 null character (\xC0\x80). To get
 	 * the actual \0 at the end of the destination buffer, we need to
 	 * append it manually.  First make room for it...
 	 */
 
 	dstLen--;
     }
+    flags |= TCL_ENCODING_MODIFIED;
     do {
-	int savedFlags = flags;
 	Tcl_EncodingState savedState = *statePtr;
 
 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
-		flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
+		flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
 		dstCharsPtr);
 	if (*dstCharsPtr <= maxChars) {
 	    break;
 	}
 	dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
-	flags = savedFlags;
 	*statePtr = savedState;
     } while (1);
     if (!noTerminate) {
@@ -1269,8 +1265,8 @@ Tcl_UtfToExternalDString(
     flags = TCL_ENCODING_START | TCL_ENCODING_END;
     while (1) {
 	result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
-		srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
-		&dstChars);
+		srcLen, flags, &state, dst, dstLen,
+		&srcRead, &dstWrote, &dstChars);
 	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
 
 	if (result != TCL_CONVERT_NOSPACE) {
@@ -1371,8 +1367,8 @@ Tcl_UtfToExternal(
 
     dstLen -= encodingPtr->nullSize;
     result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
-	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
-	    dstCharsPtr);
+	    flags, statePtr, dst, dstLen, srcReadPtr,
+	    dstWrotePtr, dstCharsPtr);
     if (encodingPtr->nullSize == 2) {
 	dst[*dstWrotePtr + 1] = '\0';
     }
@@ -2095,104 +2091,6 @@ BinaryProc(
 /*
  *-------------------------------------------------------------------------
  *
- * UtfIntToUtfExtProc --
- *
- *	Convert from UTF-8 to UTF-8. While converting null-bytes from the
- *	Tcl's internal representation (0xC0, 0x80) to the official
- *	representation (0x00). See UtfToUtfProc for details.
- *
- * Results:
- *	Returns TCL_OK if conversion was successful.
- *
- * Side effects:
- *	None.
- *
- *-------------------------------------------------------------------------
- */
-
-static int
-UtfIntToUtfExtProc(
-    ClientData clientData,
-    const char *src,		/* Source string in UTF-8. */
-    int srcLen,			/* Source string length in bytes. */
-    int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
-    char *dst,			/* Output buffer in which converted string
-				 * is stored. */
-    int dstLen,			/* The maximum length of output buffer in
-				 * bytes. */
-    int *srcReadPtr,		/* Filled with the number of bytes from the
-				 * source string that were converted. This may
-				 * be less than the original source length if
-				 * there was a problem converting some source
-				 * characters. */
-    int *dstWrotePtr,		/* Filled with the number of bytes that were
-				 * stored in the output buffer as a result of
-				 * the conversion. */
-    int *dstCharsPtr)		/* Filled with the number of characters that
-				 * correspond to the bytes stored in the
-				 * output buffer. */
-{
-    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	    srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
-}
-
-/*
- *-------------------------------------------------------------------------
- *
- * UtfExtToUtfIntProc --
- *
- *	Convert from UTF-8 to UTF-8 while converting null-bytes from the
- *	official representation (0x00) to Tcl's internal representation (0xC0,
- *	0x80). See UtfToUtfProc for details.
- *
- * Results:
- *	Returns TCL_OK if conversion was successful.
- *
- * Side effects:
- *	None.
- *
- *-------------------------------------------------------------------------
- */
-
-static int
-UtfExtToUtfIntProc(
-    ClientData clientData,
-    const char *src,		/* Source string in UTF-8. */
-    int srcLen,			/* Source string length in bytes. */
-    int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
-    char *dst,			/* Output buffer in which converted string is
-				 * stored. */
-    int dstLen,			/* The maximum length of output buffer in
-				 * bytes. */
-    int *srcReadPtr,		/* Filled with the number of bytes from the
-				 * source string that were converted. This may
-				 * be less than the original source length if
-				 * there was a problem converting some source
-				 * characters. */
-    int *dstWrotePtr,		/* Filled with the number of bytes that were
-				 * stored in the output buffer as a result of
-				 * the conversion. */
-    int *dstCharsPtr)		/* Filled with the number of characters that
-				 * correspond to the bytes stored in the
-				 * output buffer. */
-{
-    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	    srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
-}
-
-/*
- *-------------------------------------------------------------------------
- *
  * UtfToUtfProc --
  *
  *	Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
@@ -2214,11 +2112,7 @@ UtfToUtfProc(
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
+    TCL_UNUSED(Tcl_EncodingState *),
     char *dst,			/* Output buffer in which converted string is
 				 * stored. */
     int dstLen,			/* The maximum length of output buffer in
@@ -2231,21 +2125,15 @@ UtfToUtfProc(
     int *dstWrotePtr,		/* Filled with the number of bytes that were
 				 * stored in the output buffer as a result of
 				 * the conversion. */
-    int *dstCharsPtr,		/* Filled with the number of characters that
+    int *dstCharsPtr)		/* Filled with the number of characters that
 				 * correspond to the bytes stored in the
 				 * output buffer. */
-    int pureNullMode)		/* Convert embedded nulls from internal
-				 * representation to real null-bytes or vice
-				 * versa. Also combine or separate surrogate pairs */
 {
     const char *srcStart, *srcEnd, *srcClose;
     const char *dstStart, *dstEnd;
     int result, numChars, charLimit = INT_MAX;
-    int *chPtr = (int *) statePtr;
+    int ch;
 
-    if (flags & TCL_ENCODING_START) {
-    	*statePtr = 0;
-    }
     result = TCL_OK;
 
     srcStart = src;
@@ -2262,7 +2150,7 @@ UtfToUtfProc(
     dstEnd = dst + dstLen - TCL_UTF_MAX;
 
     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
-	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
+	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
 	    /*
 	     * If there is more string to follow, this will ensure that the
 	     * last UTF-8 character in the source buffer hasn't been cut off.
@@ -2275,48 +2163,62 @@ UtfToUtfProc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+	if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) {
 	    /*
 	     * Copy 7bit characters, but skip null-bytes when we are in input
 	     * mode, so that they get converted to 0xC080.
 	     */
 
 	    *dst++ = *src++;
-	} else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
-		(src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
+	} else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd)
+		&& UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) {
 	    /*
 	     * Convert 0xC080 to real nulls when we are in output mode.
 	     */
 
 	    *dst++ = 0;
 	    src += 2;
-	} else if (!TclUCS4Complete(src, srcEnd - src)) {
+	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
 	     * Always check before using TclUtfToUCS4. Not doing can so
 	     * cause it run beyond the end of the buffer! If we happen such an
-	     * incomplete char its bytes are made to represent themselves.
+	     * incomplete char its bytes are made to represent themselves
+	     * unless the user has explicitly asked to be told.
 	     */
 
-	    *chPtr = UCHAR(*src);
+	    if (flags & TCL_ENCODING_STOPONERROR) {
+		result = TCL_CONVERT_MULTIBYTE;
+		break;
+	    }
+	    ch = UCHAR(*src);
 	    src += 1;
-	    dst += Tcl_UniCharToUtf(*chPtr, dst);
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
-	    src += TclUtfToUCS4(src, chPtr);
-	    if ((*chPtr | 0x7FF) == 0xDFFF) {
-		/* A surrogate character is detected, handle especially */
-		int low = *chPtr;
-		size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
-		if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
-			*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
-			*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
-			*dst++ = (char) ((*chPtr | 0x80) & 0xBF);
-			continue;
+	    size_t len = TclUtfToUCS4(src, &ch);
+	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
+		result = TCL_CONVERT_SYNTAX;
+		break;
+	    }
+	    src += len;
+	    if ((ch | 0x7FF) == 0xDFFF) {
+		/*
+		 * A surrogate character is detected, handle especially.
+		 */
+
+		int low = ch;
+		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
+
+		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
+		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+		    *dst++ = (char) ((ch | 0x80) & 0xBF);
+		    continue;
 		}
 		src += len;
-		dst += Tcl_UniCharToUtf(*chPtr, dst);
-		*chPtr = low;
+		dst += Tcl_UniCharToUtf(ch, dst);
+		ch = low;
 	    }
-	    dst += Tcl_UniCharToUtf(*chPtr, dst);
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
     }
 
@@ -2344,7 +2246,7 @@ UtfToUtfProc(
 
 static int
 Utf16ToUtfProc(
-    ClientData clientData,	/* != NULL means LE, == NUL means BE */
+    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
     const char *src,		/* Source string in Unicode. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2370,18 +2272,27 @@ Utf16ToUtfProc(
     int result, numChars, charLimit = INT_MAX;
     unsigned short ch;
 
+    flags |= PTR2INT(clientData);
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
 	charLimit = *dstCharsPtr;
     }
     result = TCL_OK;
 
-    /* check alignment with utf-16 (2 == sizeof(UTF-16)) */
+    /*
+     * Check alignment with utf-16 (2 == sizeof(UTF-16))
+     */
+
     if ((srcLen % 2) != 0) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen--;
     }
-    /* If last code point is a high surrogate, we cannot handle that yet */
-    if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
+
+    /*
+     * If last code point is a high surrogate, we cannot handle that yet.
+     */
+
+    if ((srcLen >= 2) &&
+	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen-= 2;
     }
@@ -2398,15 +2309,17 @@ Utf16ToUtfProc(
 	    break;
 	}
 
-	if (clientData) {
+	if (flags & TCL_ENCODING_LE) {
 	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
 	} else {
 	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
 	}
+
 	/*
 	 * Special case for 1-byte utf chars for speed. Make sure we work with
 	 * unsigned short-size data.
 	 */
+
 	if (ch && ch < 0x80) {
 	    *dst++ = (ch & 0xFF);
 	} else {
@@ -2443,11 +2356,7 @@ UtfToUtf16Proc(
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
+    TCL_UNUSED(Tcl_EncodingState *),
     char *dst,			/* Output buffer in which converted string is
 				 * stored. */
     int dstLen,			/* The maximum length of output buffer in
@@ -2466,11 +2375,8 @@ UtfToUtf16Proc(
 {
     const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
     int result, numChars;
-    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+    int ch;
 
-    if (flags & TCL_ENCODING_START) {
-    	*statePtr = 0;
-    }
     srcStart = src;
     srcEnd = src + srcLen;
     srcClose = srcEnd;
@@ -2496,38 +2402,27 @@ UtfToUtf16Proc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	src += TclUtfToUniChar(src, chPtr);
-
+	src += TclUtfToUCS4(src, &ch);
 	if (clientData) {
-#if TCL_UTF_MAX > 3
-	    if (*chPtr <= 0xFFFF) {
-		*dst++ = (*chPtr & 0xFF);
-		*dst++ = (*chPtr >> 8);
+	    if (ch <= 0xFFFF) {
+		*dst++ = (ch & 0xFF);
+		*dst++ = (ch >> 8);
 	    } else {
-		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
-		*dst++ = (*chPtr & 0xFF);
-		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
+		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (ch & 0xFF);
+		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
 	    }
-#else
-	    *dst++ = (*chPtr & 0xFF);
-	    *dst++ = (*chPtr >> 8);
-#endif
 	} else {
-#if TCL_UTF_MAX > 3
-	    if (*chPtr <= 0xFFFF) {
-		*dst++ = (*chPtr >> 8);
-		*dst++ = (*chPtr & 0xFF);
+	    if (ch <= 0xFFFF) {
+		*dst++ = (ch >> 8);
+		*dst++ = (ch & 0xFF);
 	    } else {
-		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
-		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
-		*dst++ = (*chPtr & 0xFF);
+		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
+		*dst++ = (ch & 0xFF);
 	    }
-#else
-	    *dst++ = (*chPtr >> 8);
-	    *dst++ = (*chPtr & 0xFF);
-#endif
 	}
     }
     *srcReadPtr = src - srcStart;
@@ -2554,7 +2449,7 @@ UtfToUtf16Proc(
 
 static int
 UtfToUcs2Proc(
-    ClientData clientData,	/* != NULL means LE, == NUL means BE */
+    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2582,6 +2477,7 @@ UtfToUcs2Proc(
 #endif
     Tcl_UniChar ch = 0;
 
+    flags |= PTR2INT(clientData);
     srcStart = src;
     srcEnd = src + srcLen;
     srcClose = srcEnd;
@@ -2625,7 +2521,7 @@ UtfToUcs2Proc(
 	 * casting dst to a Tcl_UniChar. [Bug 1122671]
 	 */
 
-	if (clientData) {
+	if (flags & TCL_ENCODING_LE) {
 	    *dst++ = (ch & 0xFF);
 	    *dst++ = (ch >> 8);
 	} else {
@@ -3033,7 +2929,9 @@ Iso88591FromUtfProc(
 		break;
 	    }
 #if TCL_UTF_MAX <= 3
-	    if ((ch >= 0xD800) && (len < 3)) len = 4;
+	    if ((ch >= 0xD800) && (len < 3)) {
+		len = 4;
+	    }
 #endif
 	    /*
 	     * Plunge on, using '?' as a fallback character.
@@ -3078,7 +2976,7 @@ TableFreeProc(
     ClientData clientData)	/* TableEncodingData that specifies
 				 * encoding. */
 {
-    TableEncodingData *dataPtr = (TableEncodingData *)clientData;
+    TableEncodingData *dataPtr = (TableEncodingData *) clientData;
 
     /*
      * Make sure we aren't freeing twice on shutdown. [Bug 219314]
@@ -3136,7 +3034,7 @@ EscapeToUtfProc(
 				 * correspond to the bytes stored in the
 				 * output buffer. */
 {
-    EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
+    EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
     const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
     const unsigned short *const *tableToUnicode;
     const Encoding *encodingPtr;