5 files changed, 237 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index dfe776c..69684e9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1552,6 +1552,12 @@
 	* library/tzdata/America/Kralendijk: (new)
 	* library/tzdata/America/Lower_Princes: (new)
 
+2011-07-27  Donal K. Fellows  <dkf@users.sf.net>
+
+	* generic/tclEncoding.c (UtfToUtfProc): Start to rough out what needs
+	to change to transition Tcl to being able to work with non-BMP
+	characters, at least at a basic level.
+
 2011-07-26  Donal K. Fellows  <dkf@users.sf.net>
 
 	* generic/tclOO.c (initScript): Ensure that TclOO is properly found by
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 7a55724..4871b85 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -191,6 +191,14 @@ static Tcl_Encoding systemEncoding;
 static unsigned short emptyPage[256];
 
 /*
+ * Constants used in the (external) UTF-8 <--> (internal) Modified UTF-8
+ * conversion code.
+ */
+
+#define FROM_STANDARD_UTF8	0
+#define TO_STANDARD_UTF8	1
+
+/*
  * Functions used only in this module.
  */
 
@@ -2160,7 +2168,7 @@ UtfIntToUtfExtProc(
 				 * output buffer. */
 {
     return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	    srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
+	    srcReadPtr, dstWrotePtr, dstCharsPtr, TO_STANDARD_UTF8);
 }
 
 /*
@@ -2209,7 +2217,7 @@ UtfExtToUtfIntProc(
 				 * output buffer. */
 {
     return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	    srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
+	    srcReadPtr, dstWrotePtr, dstCharsPtr, FROM_STANDARD_UTF8);
 }
 
 /*
@@ -2230,6 +2238,153 @@ UtfExtToUtfIntProc(
  *-------------------------------------------------------------------------
  */
 
+static INLINE int
+IntToUtf(
+    unsigned ch,		/* The character to be stored in the
+				 * buffer. */
+    char *buf)			/* Buffer in which the UTF-8 representation of
+				 * the character is stored. Buffer must be
+				 * large enough to hold the UTF-8 character
+				 * (at most 6 bytes). */
+{
+    if ((ch > 0) && (ch < 0x80)) {
+	buf[0] = (char) ch;
+	return 1;
+    }
+    if (ch <= 0x7FF) {
+	buf[1] = (char) ((ch | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 6) | 0xC0);
+	return 2;
+    }
+    if (ch <= 0xFFFF) {
+    three:
+	buf[2] = (char) ((ch | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 12) | 0xE0);
+	return 3;
+    }
+    if (ch <= 0x1FFFFF) {
+	buf[3] = (char) ((ch | 0x80) & 0xBF);
+	buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 18) | 0xF0);
+	return 4;
+    }
+    if (ch <= 0x3FFFFFF) {
+	buf[4] = (char) ((ch | 0x80) & 0xBF);
+	buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 24) | 0xF8);
+	return 5;
+    }
+    if (ch <= 0x7FFFFFFF) {
+	buf[5] = (char) ((ch | 0x80) & 0xBF);
+	buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 30) | 0xFC);
+	return 6;
+    }
+
+    ch = 0xFFFD;
+    goto three;
+}
+
+static INLINE int
+UtfToInt(
+    const char *src,		/* The UTF-8 string. */
+    unsigned *chPtr)		/* Filled with the character represented by
+				 * the front of the UTF-8 string. */
+{
+    register int byte;
+
+    /*
+     * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones.
+     */
+
+    byte = *((unsigned char *) src);
+    if (byte < 0xC0) {
+	/*
+	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
+	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+	 * characters representing themselves.
+	 */
+
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    } else if (byte < 0xE0) {
+	if ((src[1] & 0xC0) == 0x80) {
+	    /*
+	     * Two-byte-character lead-byte followed by a trail-byte.
+	     */
+
+	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+	    return 2;
+	}
+
+	/*
+	 * A two-byte-character lead-byte not followed by trail-byte
+	 * represents itself.
+	 */
+
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    } else if (byte < 0xF0) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
+	    /*
+	     * Three-byte-character lead byte followed by two trail bytes.
+	     */
+
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
+		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+	    return 3;
+	}
+
+	/*
+	 * A three-byte-character lead-byte not followed by two trail-bytes
+	 * represents itself.
+	 */
+
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    } else {
+	int ch, total, trail;
+	static const unsigned char totalBytes[256] = {
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
+	};
+
+	total = totalBytes[byte];
+	trail = total - 1;
+	if (trail > 0) {
+	    ch = byte & (0x3F >> trail);
+	    do {
+		src++;
+		if ((*src & 0xC0) != 0x80) {
+		    *chPtr = byte;
+		    return 1;
+		}
+		ch <<= 6;
+		ch |= (*src & 0x3F);
+		trail--;
+	    } while (trail > 0);
+	    *chPtr = ch;
+	    return total;
+	} else {
+	    *chPtr = (Tcl_UniChar) byte;
+	    return 1;
+	}
+    }
+}
+
 static int
 UtfToUtfProc(
     ClientData clientData,	/* Not used. */
@@ -2256,7 +2411,7 @@ UtfToUtfProc(
     int *dstCharsPtr,		/* Filled with the number of characters that
 				 * correspond to the bytes stored in the
 				 * output buffer. */
-    int pureNullMode)		/* Convert embedded nulls from internal
+    int conversionMode)		/* Convert embedded nulls from internal
 				 * representation to real null-bytes or vice
 				 * versa. */
 {
@@ -2291,15 +2446,16 @@ UtfToUtfProc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+	if (UCHAR(*src) < 0x80 &&
+		!(UCHAR(*src) == 0 && conversionMode == FROM_STANDARD_UTF8)) {
 	    /*
 	     * Copy 7bit chatacters, but skip null-bytes when we are in input
 	     * mode, so that they get converted to 0xc080.
 	     */
 
 	    *dst++ = *src++;
-	} else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 &&
-		UCHAR(*(src+1)) == 0x80) {
+	} else if (conversionMode == TO_STANDARD_UTF8 && UCHAR(*src) == 0xc0
+		&& UCHAR(*(src+1)) == 0x80) {
 	    /*
 	     * Convert 0xc080 to real nulls when we are in output mode.
 	     */
@@ -2310,15 +2466,51 @@ UtfToUtfProc(
 	    /*
 	     * Always check before using Tcl_UtfToUniChar. Not doing can so
 	     * cause it run beyond the endof the buffer! If we happen such an
-	     * incomplete char its byts are made to represent themselves.
+	     * incomplete char, its bytes are made to represent themselves.
 	     */
 
 	    ch = (unsigned char) *src;
 	    src += 1;
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
-	    src += Tcl_UtfToUniChar(src, &ch);
-	    dst += Tcl_UniCharToUtf(ch, dst);
+	    /*
+	     * This is where we ought to do surrogate pair handling, with the
+	     * correct way of doing it depending on the conversionMode
+	     * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE!
+	     */
+
+	    if (conversionMode == TO_STANDARD_UTF8) {
+		const char *origin = src;
+
+		src += Tcl_UtfToUniChar(src, &ch);
+		if (ch >= 0xD800 && ch < 0xDBFF) {
+		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;
+
+		    src += Tcl_UtfToUniChar(src, &ch);
+		    if (ch >= 0xDC00 && ch < 0xDFFF) {
+			fullChar += 0x2400 + (unsigned) ch;
+			dst += IntToUtf(fullChar, dst);
+			continue;
+		    } else {
+			src = origin + Tcl_UtfToUniChar(origin, &ch);
+		    }
+		}
+		dst += Tcl_UniCharToUtf(ch, dst);
+	    } else {
+		unsigned fullChar;
+
+		src += UtfToInt(src, &fullChar);
+		if (fullChar > 0xFFFF) {
+		    fullChar -= 0x10000;
+		    ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800);
+		    dst += Tcl_UniCharToUtf(ch, dst);
+		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
+		    dst += Tcl_UniCharToUtf(ch, dst);
+		} else {
+		    ch = (Tcl_UniChar) fullChar;
+		    dst += Tcl_UniCharToUtf(ch, dst);
+		}
+	    }
 	}
     }
 
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index f0d08e7..74a89c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -133,7 +133,7 @@ UtfCount(
 #endif
     return 3;
 }
-
+
 /*
  *---------------------------------------------------------------------------
  *
diff --git a/generic/tclZlib.c b/generic/tclZlib.c
index 544fb6e..7f18b9e 100644
--- a/generic/tclZlib.c
+++ b/generic/tclZlib.c
@@ -1613,8 +1613,7 @@ ZlibCmd(
     if (objc < 2) {
 	Tcl_WrongNumArgs(interp, 1, objv, "command arg ?...?");
 	return TCL_ERROR;
-    }
-    if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0,
+    } else if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0,
 	    &command) != TCL_OK) {
 	return TCL_ERROR;
     }
@@ -1972,8 +1971,7 @@ ZlibCmd(
 		    Tcl_SetErrorCode(interp, "TCL", "ZIP", "NOVAL", NULL);
 		    return TCL_ERROR;
 		}
-		if (Tcl_GetIntFromObj(interp, objv[i],
-			(int *) &limit) != TCL_OK) {
+		if (Tcl_GetIntFromObj(interp, objv[i], (int *) &limit) != TCL_OK) {
 		    Tcl_AddErrorInfo(interp, "\n    (in -limit option)");
 		    return TCL_ERROR;
 		}
@@ -1984,6 +1982,10 @@ ZlibCmd(
 	    }
 	}
 
+	/*
+	 * Actually do the push of the instance of the transform.
+	 */
+
 	if (ZlibStackChannelTransform(interp, mode, format, level, chan,
 		headerObj) == NULL) {
 	    return TCL_ERROR;
@@ -2045,16 +2047,22 @@ ZlibStreamCmd(
 	ao_buffer, ao_finalize, ao_flush, ao_fullflush
     };
 
+    /*
+     * Basic syntax checks.
+     */
+
     if (objc < 2) {
 	Tcl_WrongNumArgs(interp, 1, objv, "option data ?...?");
 	return TCL_ERROR;
-    }
-
-    if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0,
+    } else if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0,
 	    &command) != TCL_OK) {
 	return TCL_ERROR;
     }
 
+    /*
+     * Execute a relevant subcommand.
+     */
+
     switch ((enum zlibStreamCommands) command) {
     case zs_add:		/* $strm add ?$flushopt? $data */
 	for (i=2; i<objc-1; i++) {
@@ -2104,6 +2112,11 @@ ZlibStreamCmd(
 			    NULL);
 		    return TCL_ERROR;
 		}
+		if (buffersize < 1 || buffersize > 65536) {
+		    Tcl_AppendResult(interp, "buffer size must be between "
+			    "1 byte and 64 kibibytes", NULL);
+		    return TCL_ERROR;
+		}
 	    }
 
 	    if (flush == -2) {
diff --git a/tests/encoding.test b/tests/encoding.test
index 0374e2d..9219a69 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -328,6 +328,15 @@ test encoding-15.3 {UtfToUtfProc null character input} {
     list [string bytelength $x] [string bytelength $y] $z
 } {1 2 c080}
 
+test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} {
+    set x \xF0\xA4\xAD\xA2; # U+024B62
+    set y [encoding convertfrom utf-8 $x]
+    set z [encoding convertto utf-8 $y]
+    list [string length $x] [string length $y] [string length $z] \
+	[format 0x%04x.0x%04x {*}[scan $y %c%c]] \
+	[format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]]
+} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2}
+
 test encoding-16.1 {UnicodeToUtfProc} {
     set val [encoding convertfrom unicode NN]
     list $val [format %x [scan $val %c]]