diff options
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | generic/tcl.h | 8 | ||||
-rw-r--r-- | generic/tclEncoding.c | 59 | ||||
-rw-r--r-- | tests/encoding.test | 7 |
4 files changed, 52 insertions, 29 deletions
@@ -1,3 +1,10 @@ +2006-10-05 Jeff Hobbs <jeffh@ActiveState.com> + + * generic/tcl.h: note limitation on changing Tcl_UniChar size + * generic/tclEncoding.c (UtfToUnicodeProc, UnicodeToUtfProc): + * tests/encoding.test (encoding-16.1): fix alignment issues in + unicode <> utf conversion procs. [Bug 1122671] + 2006-10-05 Miguel Sofer <msofer@users.sf.net> * generic/tclVar.c (Tcl_LappendObjCmd): diff --git a/generic/tcl.h b/generic/tcl.h index d7bdc90..696076b 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tcl.h,v 1.153.2.28 2006/09/26 21:40:36 patthoyts Exp $ + * RCS: @(#) $Id: tcl.h,v 1.153.2.29 2006/10/05 21:24:56 hobbs Exp $ */ #ifndef _TCL @@ -2238,7 +2238,11 @@ typedef struct Tcl_Parse { /* * unsigned int isn't 100% accurate as it should be a strict 4-byte * value (perhaps wchar_t). 64-bit systems may have troubles. The - * size of this value must be reflected correctly in regcustom.h. + * size of this value must be reflected correctly in regcustom.h and + * in tclEncoding.c. + * XXX: Tcl is currently UCS-2 and planning UTF-16 for the Unicode + * XXX: string rep that Tcl_UniChar represents. Changing the size + * XXX: of Tcl_UniChar is /not/ supported. */ typedef unsigned int Tcl_UniChar; #else diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5b2ae19..70c9eb6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.11 2006/08/09 18:12:42 dgp Exp $ + * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.12 2006/10/05 21:24:56 hobbs Exp $ */ #include "tclInt.h" @@ -2147,10 +2147,11 @@ UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, * correspond to the bytes stored in the * output buffer. */ { - CONST Tcl_UniChar *wSrc, *wSrcStart, *wSrcEnd; + CONST char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - + Tcl_UniChar ch; + result = TCL_OK; if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; @@ -2158,31 +2159,31 @@ UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, srcLen *= sizeof(Tcl_UniChar); } - wSrc = (Tcl_UniChar *) src; - - wSrcStart = (Tcl_UniChar *) src; - wSrcEnd = (Tcl_UniChar *) (src + srcLen); + srcStart = src; + srcEnd = src + srcLen; dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; wSrc < wSrcEnd; numChars++) { + for (numChars = 0; src < srcEnd; numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } /* - * Special case for 1-byte utf chars for speed. + * Special case for 1-byte utf chars for speed. Make sure we + * work with Tcl_UniChar-size data. */ - if (*wSrc && *wSrc < 0x80) { - *dst++ = (char) *wSrc; + ch = *(Tcl_UniChar *)src; + if (ch && ch < 0x80) { + *dst++ = *src; } else { - dst += Tcl_UniCharToUtf(*wSrc, dst); + dst += Tcl_UniCharToUtf(ch, dst); } - wSrc++; + src += sizeof(Tcl_UniChar); } - *srcReadPtr = (char *) wSrc - (char *) wSrcStart; + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; return result; @@ -2232,10 +2233,10 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, * correspond to the bytes stored in the * output buffer. */ { - CONST char *srcStart, *srcEnd, *srcClose; - Tcl_UniChar *wDst, *wDstStart, *wDstEnd; + CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - + Tcl_UniChar ch; + srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2243,9 +2244,8 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, srcClose -= TCL_UTF_MAX; } - wDst = (Tcl_UniChar *) dst; - wDstStart = (Tcl_UniChar *) dst; - wDstEnd = (Tcl_UniChar *) (dst + dstLen - sizeof(Tcl_UniChar)); + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { @@ -2258,15 +2258,26 @@ UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, result = TCL_CONVERT_MULTIBYTE; break; } - if (wDst > wDstEnd) { + if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, wDst); - wDst++; + src += TclUtfToUniChar(src, &ch); + /* + * Need to handle this in a way that won't cause misalignment + * by casting dst to a Tcl_UniChar. [Bug 1122671] + * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. + */ +#ifdef WORDS_BIGENDIAN + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); +#else + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif } *srcReadPtr = src - srcStart; - *dstWrotePtr = (char *) wDst - (char *) wDstStart; + *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; return result; } diff --git a/tests/encoding.test b/tests/encoding.test index 83f5f69..4e5a7fa 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -8,7 +8,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: encoding.test,v 1.16.2.2 2004/05/27 14:33:20 rmax Exp $ +# RCS: @(#) $Id: encoding.test,v 1.16.2.3 2006/10/05 21:24:56 hobbs Exp $ package require tcltest 2 namespace import -force ::tcltest::* @@ -307,8 +307,9 @@ test encoding-15.3 {UtfToUtfProc null character input} { } {1 2 c080} test encoding-16.1 {UnicodeToUtfProc} { - encoding convertfrom unicode NN -} "\u4e4e" + set val [encoding convertfrom unicode NN] + list $val [format %x [scan $val %c]] +} "\u4e4e 4e4e" test encoding-17.1 {UtfToUnicodeProc} { } {} |