1 files changed, 140 insertions, 82 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index d235911..cb5dd20 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -3009,6 +3009,7 @@ Utf16ToUtfProc(
 	srcLen--;
     }
 
+#if 0
     /*
      * If last code point is a high surrogate, we cannot handle that yet,
      * unless we are at the end.
@@ -3019,6 +3020,7 @@ Utf16ToUtfProc(
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen-= 2;
     }
+#endif
 
     srcStart = src;
     srcEnd = src + srcLen;
@@ -3028,7 +3030,7 @@ Utf16ToUtfProc(
 
     for (numChars = 0; src < srcEnd && numChars <= charLimit;
 	    src += 2, numChars++) {
-	if (dst > dstEnd) {
+	if (dst > dstEnd && !HIGH_SURROGATE(ch)) {
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
@@ -3039,94 +3041,150 @@ Utf16ToUtfProc(
 	} else {
 	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
 	}
-	if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
-	    if (PROFILE_STRICT(flags)) {
-		result = TCL_CONVERT_SYNTAX;
-		src -= 2;	/* Go back to beginning of high surrogate */
-		dst--;		/* Also undo writing a single byte too much */
-		numChars--;
-		break;
-	    } else if (PROFILE_REPLACE(flags)) {
-		/*
-		 * Previous loop wrote a single byte to mark the high surrogate.
-		 * Replace it with the replacement character. Further, restart
-		 * current loop iteration since need to recheck destination
-		 * space and reset processing of current character.
-		 */
-		ch = UNICODE_REPLACE_CHAR;
-		dst--;
-		dst += Tcl_UniCharToUtf(ch, dst);
-		src -= 2;
-		numChars--;
-		continue;
-	    } else {
-		/*
-		 * Bug [10c2c17c32]. If Hi surrogate not followed by Lo
-		 * surrogate, finish 3-byte UTF-8
-		 */
-		dst += Tcl_UniCharToUtf(-1, dst);
-	    }
-	}
-
-	/*
-	 * Special case for 1-byte utf chars for speed. Make sure we work with
-	 * unsigned short-size data.
-	 */
-
-	if ((unsigned)ch - 1 < 0x7F) {
-	    *dst++ = (ch & 0xFF);
-	} else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
-	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
-            if (HIGH_SURROGATE(prev) && LOW_SURROGATE(ch)) {
-                --numChars; /* Character has been combined, so compensage count */
+	if (HIGH_SURROGATE(prev)) {
+            if (LOW_SURROGATE(ch)) {
+                /*
+                 * High surrogate was followed by a low surrogate.
+                 * Tcl_UniCharToUtf would have stashed away the state in dst.
+                 * Call it again to combine that state with the low surrogate.
+                 * We also have to compensate the numChars as two UTF-16 units
+                 * have been combined into one character.
+                 */
+                dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+                --numChars;
+            } else {
+                /* High surrogate was not followed by a low surrogate */
+                if (PROFILE_STRICT(flags)) {
+                    result = TCL_CONVERT_SYNTAX;
+                    src -= 2;	/* Go back to beginning of high surrogate */
+                    dst--;		/* Also undo writing a single byte too much */
+                    numChars--;
+                    break;
+                }
+                if (PROFILE_REPLACE(flags)) {
+                    /*
+                     * Previous loop wrote a single byte to mark the high surrogate.
+                     * Replace it with the replacement character. Further, restart
+                     * current loop iteration since need to recheck destination
+                     * space and reset processing of current character.
+                     */
+                    ch = UNICODE_REPLACE_CHAR;
+                    dst--;
+                    dst += Tcl_UniCharToUtf(ch, dst);
+                } else {
+                    /*
+                     * Bug [10c2c17c32]. If Hi surrogate not followed by Lo
+                     * surrogate, finish 3-byte UTF-8
+                     */
+                    dst += Tcl_UniCharToUtf(-1, dst);
+                }
+                /* Loop around again so destination space and other checks are done */
+                prev = 0; /* Reset high surrogate tracker */
+                src -= 2;
+                numChars--;
             }
-	} else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) {
-	    /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
-	    if (PROFILE_STRICT(flags)) {
-		result = TCL_CONVERT_SYNTAX;
-		break;
-	    } else {
-		/* PROFILE_REPLACE */
-		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
-	    }
 	} else {
-	    dst += Tcl_UniCharToUtf(ch, dst);
-	}
-    }
+            /* Previous char was not a high surrogate */
 
-    if (HIGH_SURROGATE(ch)) {
-	if (PROFILE_STRICT(flags)) {
-	    result = TCL_CONVERT_SYNTAX;
-	    src -= 2;
-	    dst--;
-	    numChars--;
-	} else if (PROFILE_REPLACE(flags)) {
-	    dst--;
-	    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
-	} else {
-	    /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
-	    dst += Tcl_UniCharToUtf(-1, dst);
-	}
+            /*
+             * Special case for 1-byte utf chars for speed. Make sure we work with
+             * unsigned short-size data. Order checks based on expected frequency.
+             */
+            if ((unsigned)ch - 1 < 0x7F) {
+                /* ASCII except nul */
+                *dst++ = (ch & 0xFF);
+            } else if (!SURROGATE(ch)) {
+                /* Not ASCII, not surrogate */
+                dst += Tcl_UniCharToUtf(ch, dst);
+            } else if (HIGH_SURROGATE(ch)) {
+                dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+            } else {
+                assert(LOW_SURROGATE(ch));
+                if (PROFILE_STRICT(flags)) {
+                    result = TCL_CONVERT_SYNTAX;
+                    break;
+                }
+                if (PROFILE_REPLACE(flags)) {
+                    ch = UNICODE_REPLACE_CHAR;
+                }
+                dst += Tcl_UniCharToUtf(ch, dst);
+            }
+        }
     }
 
     /*
-     * If we had a truncated code unit at the end AND this is the last
-     * fragment AND profile is not "strict", stick FFFD in its place.
+     * When the above loop ends, result may have the following values:
+     * 1. TCL_OK - full source buffer was completely processed.
+     *    src, dst, numChars will hold values up to that point BUT
+     *    there may be a leftover high surrogate we need to deal with.
+     * 2. TCL_CONVERT_NOSPACE - Ran out of room in the destination buffer.
+     *    Same considerations as (1)
+     * 3. TCL_CONVERT_SYNTAX - decoding error. src, dst, numChars will
+     *    hold the correct values up to the point of error even if the
+     *    last character decoded was a high surrogate.
+     * 4. TCL_CONVERT_MULTIBYTE - the buffer passed in was not fully
+     *    processed, because there was a trailing single byte. However,
+     *    we may have processed the requested number of characters already
+     *    in which case the trailing byte does not matter. There
+     *    may still be a leftover high surrogate as in (1) and (2).
      */
-    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
-	if (dst > dstEnd) {
-	    result = TCL_CONVERT_NOSPACE;
-	} else {
-	    if (PROFILE_STRICT(flags)) {
-		result = TCL_CONVERT_SYNTAX;
-	    } else {
-		/* PROFILE_REPLACE or PROFILE_TCL8 */
-		result = TCL_OK;
-		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
-		numChars++;
-		src++; /* Go past truncated code unit */
-	    }
-	}
+    switch (result) {
+    case TCL_CONVERT_MULTIBYTE: /* FALLTHRU */
+    case TCL_OK: /* FALLTHRU */
+    case TCL_CONVERT_NOSPACE:
+        if (HIGH_SURROGATE(ch)) {
+            if (flags & TCL_ENCODING_END) {
+                /*
+                 * No more data expected. There will be space for output of
+                 * one character (essentially overwriting the dst area holding
+                 * high surrogate state)
+                 */
+                assert((dst-1) <= dstEnd);
+                if (PROFILE_STRICT(flags)) {
+                    result = TCL_CONVERT_SYNTAX;
+                    src -= 2;
+                    dst--;
+                    numChars--;
+                } else if (PROFILE_REPLACE(flags)) {
+                    dst--;
+                    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+                } else {
+                    /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+                    dst += Tcl_UniCharToUtf(-1, dst);
+                }
+            } else {
+                /* More data is expected. Revert the surrogate state */
+                src -= 2;
+                dst--;
+                numChars--;
+                /* Note: leave result of TCL_CONVERT_NOSPACE as is */
+                if (result == TCL_OK) {
+                    result = TCL_CONVERT_MULTIBYTE;
+                }
+            }
+        } else if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
+            /*
+             * If we had a trailing byte at the end AND this is the last
+             * fragment AND profile is not "strict", stick FFFD in its place.
+             * Note in this case we DO need to check for room in dst.
+             */
+            if (dst > dstEnd) {
+                result = TCL_CONVERT_NOSPACE;
+            } else {
+                if (PROFILE_STRICT(flags)) {
+                    result = TCL_CONVERT_SYNTAX;
+                } else {
+                    /* PROFILE_REPLACE or PROFILE_TCL8 */
+                    result = TCL_OK;
+                    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+                    numChars++;
+                    src++;
+                }
+            }
+        }
+        break;
+    case TCL_CONVERT_SYNTAX:
+        break; /* Nothing to do */
     }
 
     *srcReadPtr = src - srcStart;