diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | generic/tclEncoding.c | 210 | ||||
-rw-r--r-- | generic/tclUtf.c | 2 | ||||
-rw-r--r-- | generic/tclZlib.c | 27 | ||||
-rw-r--r-- | tests/encoding.test | 9 |
5 files changed, 237 insertions, 17 deletions
@@ -1552,6 +1552,12 @@ * library/tzdata/America/Kralendijk: (new) * library/tzdata/America/Lower_Princes: (new) +2011-07-27 Donal K. Fellows <dkf@users.sf.net> + + * generic/tclEncoding.c (UtfToUtfProc): Start to rough out what needs + to change to transition Tcl to being able to work with non-BMP + characters, at least at a basic level. + 2011-07-26 Donal K. Fellows <dkf@users.sf.net> * generic/tclOO.c (initScript): Ensure that TclOO is properly found by diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7a55724..4871b85 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -191,6 +191,14 @@ static Tcl_Encoding systemEncoding; static unsigned short emptyPage[256]; /* + * Constants used in the (external) UTF-8 <--> (internal) Modified UTF-8 + * conversion code. + */ + +#define FROM_STANDARD_UTF8 0 +#define TO_STANDARD_UTF8 1 + +/* * Functions used only in this module. */ @@ -2160,7 +2168,7 @@ UtfIntToUtfExtProc( * output buffer. */ { return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 1); + srcReadPtr, dstWrotePtr, dstCharsPtr, TO_STANDARD_UTF8); } /* @@ -2209,7 +2217,7 @@ UtfExtToUtfIntProc( * output buffer. */ { return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 0); + srcReadPtr, dstWrotePtr, dstCharsPtr, FROM_STANDARD_UTF8); } /* @@ -2230,6 +2238,153 @@ UtfExtToUtfIntProc( *------------------------------------------------------------------------- */ +static INLINE int +IntToUtf( + unsigned ch, /* The character to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the character is stored. Buffer must be + * large enough to hold the UTF-8 character + * (at most 6 bytes). */ +{ + if ((ch > 0) && (ch < 0x80)) { + buf[0] = (char) ch; + return 1; + } + if (ch <= 0x7FF) { + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { + three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } + + ch = 0xFFFD; + goto three; +} + +static INLINE int +UtfToInt( + const char *src, /* The UTF-8 string. */ + unsigned *chPtr) /* Filled with the character represented by + * the front of the UTF-8 string. */ +{ + register int byte; + + /* + * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * characters representing themselves. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + return 2; + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + return 3; + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else { + int ch, total, trail; + static const unsigned char totalBytes[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 + }; + + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + src++; + if ((*src & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*src & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; + } else { + *chPtr = (Tcl_UniChar) byte; + return 1; + } + } +} + static int UtfToUtfProc( ClientData clientData, /* Not used. */ @@ -2256,7 +2411,7 @@ UtfToUtfProc( int *dstCharsPtr, /* Filled with the number of characters that * correspond to the bytes stored in the * output buffer. */ - int pureNullMode) /* Convert embedded nulls from internal + int conversionMode) /* Convert embedded nulls from internal * representation to real null-bytes or vice * versa. */ { @@ -2291,15 +2446,16 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) { + if (UCHAR(*src) < 0x80 && + !(UCHAR(*src) == 0 && conversionMode == FROM_STANDARD_UTF8)) { /* * Copy 7bit chatacters, but skip null-bytes when we are in input * mode, so that they get converted to 0xc080. */ *dst++ = *src++; - } else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 && - UCHAR(*(src+1)) == 0x80) { + } else if (conversionMode == TO_STANDARD_UTF8 && UCHAR(*src) == 0xc0 + && UCHAR(*(src+1)) == 0x80) { /* * Convert 0xc080 to real nulls when we are in output mode. */ @@ -2310,15 +2466,51 @@ UtfToUtfProc( /* * Always check before using Tcl_UtfToUniChar. Not doing can so * cause it run beyond the endof the buffer! If we happen such an - * incomplete char its byts are made to represent themselves. + * incomplete char, its bytes are made to represent themselves. */ ch = (unsigned char) *src; src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(ch, dst); + /* + * This is where we ought to do surrogate pair handling, with the + * correct way of doing it depending on the conversionMode + * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE! + */ + + if (conversionMode == TO_STANDARD_UTF8) { + const char *origin = src; + + src += Tcl_UtfToUniChar(src, &ch); + if (ch >= 0xD800 && ch < 0xDBFF) { + unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10; + + src += Tcl_UtfToUniChar(src, &ch); + if (ch >= 0xDC00 && ch < 0xDFFF) { + fullChar += 0x2400 + (unsigned) ch; + dst += IntToUtf(fullChar, dst); + continue; + } else { + src = origin + Tcl_UtfToUniChar(origin, &ch); + } + } + dst += Tcl_UniCharToUtf(ch, dst); + } else { + unsigned fullChar; + + src += UtfToInt(src, &fullChar); + if (fullChar > 0xFFFF) { + fullChar -= 0x10000; + ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800); + dst += Tcl_UniCharToUtf(ch, dst); + ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00); + dst += Tcl_UniCharToUtf(ch, dst); + } else { + ch = (Tcl_UniChar) fullChar; + dst += Tcl_UniCharToUtf(ch, dst); + } + } } } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index f0d08e7..74a89c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -133,7 +133,7 @@ UtfCount( #endif return 3; } - + /* *--------------------------------------------------------------------------- * diff --git a/generic/tclZlib.c b/generic/tclZlib.c index 544fb6e..7f18b9e 100644 --- a/generic/tclZlib.c +++ b/generic/tclZlib.c @@ -1613,8 +1613,7 @@ ZlibCmd( if (objc < 2) { Tcl_WrongNumArgs(interp, 1, objv, "command arg ?...?"); return TCL_ERROR; - } - if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0, + } else if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0, &command) != TCL_OK) { return TCL_ERROR; } @@ -1972,8 +1971,7 @@ ZlibCmd( Tcl_SetErrorCode(interp, "TCL", "ZIP", "NOVAL", NULL); return TCL_ERROR; } - if (Tcl_GetIntFromObj(interp, objv[i], - (int *) &limit) != TCL_OK) { + if (Tcl_GetIntFromObj(interp, objv[i], (int *) &limit) != TCL_OK) { Tcl_AddErrorInfo(interp, "\n (in -limit option)"); return TCL_ERROR; } @@ -1984,6 +1982,10 @@ ZlibCmd( } } + /* + * Actually do the push of the instance of the transform. + */ + if (ZlibStackChannelTransform(interp, mode, format, level, chan, headerObj) == NULL) { return TCL_ERROR; @@ -2045,16 +2047,22 @@ ZlibStreamCmd( ao_buffer, ao_finalize, ao_flush, ao_fullflush }; + /* + * Basic syntax checks. + */ + if (objc < 2) { Tcl_WrongNumArgs(interp, 1, objv, "option data ?...?"); return TCL_ERROR; - } - - if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0, + } else if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0, &command) != TCL_OK) { return TCL_ERROR; } + /* + * Execute a relevant subcommand. + */ + switch ((enum zlibStreamCommands) command) { case zs_add: /* $strm add ?$flushopt? $data */ for (i=2; i<objc-1; i++) { @@ -2104,6 +2112,11 @@ ZlibStreamCmd( NULL); return TCL_ERROR; } + if (buffersize < 1 || buffersize > 65536) { + Tcl_AppendResult(interp, "buffer size must be between " + "1 byte and 64 kibibytes", NULL); + return TCL_ERROR; + } } if (flush == -2) { diff --git a/tests/encoding.test b/tests/encoding.test index 0374e2d..9219a69 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -328,6 +328,15 @@ test encoding-15.3 {UtfToUtfProc null character input} { list [string bytelength $x] [string bytelength $y] $z } {1 2 c080} +test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} { + set x \xF0\xA4\xAD\xA2; # U+024B62 + set y [encoding convertfrom utf-8 $x] + set z [encoding convertto utf-8 $y] + list [string length $x] [string length $y] [string length $z] \ + [format 0x%04x.0x%04x {*}[scan $y %c%c]] \ + [format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]] +} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2} + test encoding-16.1 {UnicodeToUtfProc} { set val [encoding convertfrom unicode NN] list $val [format %x [scan $val %c]] |