4 files changed, 52 insertions, 29 deletions
diff --git a/ChangeLog b/ChangeLog
index 2d70cc1..5331add 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2006-10-05  Jeff Hobbs  <jeffh@ActiveState.com>
+
+	* generic/tcl.h: note limitation on changing Tcl_UniChar size
+	* generic/tclEncoding.c (UtfToUnicodeProc, UnicodeToUtfProc):
+	* tests/encoding.test (encoding-16.1): fix alignment issues in
+	unicode <> utf conversion procs. [Bug 1122671]
+
 2006-10-05  Miguel Sofer  <msofer@users.sf.net>
 
 	* generic/tclVar.c (Tcl_LappendObjCmd):
diff --git a/generic/tcl.h b/generic/tcl.h
index d7bdc90..696076b 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -13,7 +13,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tcl.h,v 1.153.2.28 2006/09/26 21:40:36 patthoyts Exp $
+ * RCS: @(#) $Id: tcl.h,v 1.153.2.29 2006/10/05 21:24:56 hobbs Exp $
  */
 
 #ifndef _TCL
@@ -2238,7 +2238,11 @@ typedef struct Tcl_Parse {
     /*
      * unsigned int isn't 100% accurate as it should be a strict 4-byte
      * value (perhaps wchar_t).  64-bit systems may have troubles.  The
-     * size of this value must be reflected correctly in regcustom.h.
+     * size of this value must be reflected correctly in regcustom.h and
+     * in tclEncoding.c.
+     * XXX: Tcl currently uses UCS-2, with UTF-16 planned, for the Unicode
+     * XXX: string representation built on Tcl_UniChar.  Changing the size
+     * XXX: of Tcl_UniChar is /not/ supported.
      */
 typedef unsigned int Tcl_UniChar;
 #else
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 5b2ae19..70c9eb6 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.11 2006/08/09 18:12:42 dgp Exp $
+ * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.12 2006/10/05 21:24:56 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -2147,10 +2147,11 @@ UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 				 * correspond to the bytes stored in the
 				 * output buffer. */
 {
-    CONST Tcl_UniChar *wSrc, *wSrcStart, *wSrcEnd;
+    CONST char *srcStart, *srcEnd;
     char *dstEnd, *dstStart;
     int result, numChars;
-    
+    Tcl_UniChar ch;
+
     result = TCL_OK;
     if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
 	result = TCL_CONVERT_MULTIBYTE;
@@ -2158,31 +2159,31 @@ UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 	srcLen *= sizeof(Tcl_UniChar);
     }
 
-    wSrc = (Tcl_UniChar *) src;
-
-    wSrcStart = (Tcl_UniChar *) src;
-    wSrcEnd = (Tcl_UniChar *) (src + srcLen);
+    srcStart = src;
+    srcEnd = src + srcLen;
 
     dstStart = dst;
     dstEnd = dst + dstLen - TCL_UTF_MAX;
 
-    for (numChars = 0; wSrc < wSrcEnd; numChars++) {
+    for (numChars = 0; src < srcEnd; numChars++) {
 	if (dst > dstEnd) {
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
 	/*
-	 * Special case for 1-byte utf chars for speed.
+	 * Special case for 1-byte utf chars for speed.  Make sure we
+	 * work with Tcl_UniChar-size data.
 	 */
-	if (*wSrc && *wSrc < 0x80) {
-	    *dst++ = (char) *wSrc;
+	ch = *(Tcl_UniChar *)src;
+	if (ch && ch < 0x80) {
+	    *dst++ = *src;
 	} else {
-	    dst += Tcl_UniCharToUtf(*wSrc, dst);
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
-	wSrc++;
+	src += sizeof(Tcl_UniChar);
     }
 
-    *srcReadPtr = (char *) wSrc - (char *) wSrcStart;
+    *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
     return result;
@@ -2232,10 +2233,10 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 				 * correspond to the bytes stored in the
 				 * output buffer. */
 {
-    CONST char *srcStart, *srcEnd, *srcClose;
-    Tcl_UniChar *wDst, *wDstStart, *wDstEnd;
+    CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
     int result, numChars;
-    
+    Tcl_UniChar ch;
+
     srcStart = src;
     srcEnd = src + srcLen;
     srcClose = srcEnd;
@@ -2243,9 +2244,8 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 	srcClose -= TCL_UTF_MAX;
     }
 
-    wDst = (Tcl_UniChar *) dst;
-    wDstStart = (Tcl_UniChar *) dst;
-    wDstEnd = (Tcl_UniChar *) (dst + dstLen - sizeof(Tcl_UniChar));
+    dstStart = dst;
+    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
 
     result = TCL_OK;
     for (numChars = 0; src < srcEnd; numChars++) {
@@ -2258,15 +2258,26 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 	    result = TCL_CONVERT_MULTIBYTE;
 	    break;
 	}
-	if (wDst > wDstEnd) {
+	if (dst > dstEnd) {
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
         }
-	src += TclUtfToUniChar(src, wDst);
-	wDst++;
+	src += TclUtfToUniChar(src, &ch);
+	/*
+	 * Store the character one byte at a time: casting dst to a
+	 * Tcl_UniChar * could cause a misaligned write. [Bug 1122671]
+	 * XXX: This hard-codes the assumed size of Tcl_UniChar as 2.
+	 */
+#ifdef WORDS_BIGENDIAN
+	*dst++ = (ch >> 8);
+	*dst++ = (ch & 0xFF);
+#else
+	*dst++ = (ch & 0xFF);
+	*dst++ = (ch >> 8);
+#endif
     }
     *srcReadPtr = src - srcStart;
-    *dstWrotePtr = (char *) wDst - (char *) wDstStart;
+    *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
     return result;
 }
diff --git a/tests/encoding.test b/tests/encoding.test
index 83f5f69..4e5a7fa 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -8,7 +8,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 #
-# RCS: @(#) $Id: encoding.test,v 1.16.2.2 2004/05/27 14:33:20 rmax Exp $
+# RCS: @(#) $Id: encoding.test,v 1.16.2.3 2006/10/05 21:24:56 hobbs Exp $
 
 package require tcltest 2
 namespace import -force ::tcltest::*
@@ -307,8 +307,9 @@ test encoding-15.3 {UtfToUtfProc null character input} {
 } {1 2 c080}
 
 test encoding-16.1 {UnicodeToUtfProc} {
-    encoding convertfrom unicode NN
-} "\u4e4e"
+    set val [encoding convertfrom unicode NN]
+    list $val [format %x [scan $val %c]]
+} "\u4e4e 4e4e"
 
 test encoding-17.1 {UtfToUnicodeProc} {
 } {}