From a7ec180fc75e299b71f6d839da636eff3528a713 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 31 May 2019 23:19:07 +0000 Subject: TIP #547 implementation: New encodings: UTF-16, UCS-2 --- generic/tclEncoding.c | 226 +++++++++++++++++++++++++++++++++++++++++--------- tests/binary.test | 2 +- tests/chanio.test | 4 +- tests/encoding.test | 23 +++-- tests/io.test | 4 +- tests/ioCmd.test | 8 +- tests/source.test | 4 +- 7 files changed, 213 insertions(+), 58 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7eb73e8..a88c1a7 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -234,12 +234,17 @@ static int TableToUtfProc(ClientData clientData, const char *src, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); static size_t unilen(const char *src); -static int UniCharToUtfProc(ClientData clientData, +static int Utf16ToUtfProc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); -static int UtfToUniCharProc(ClientData clientData, +static int UtfToUtf16Proc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUcs2Proc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, @@ -564,11 +569,16 @@ TclInitEncodingSubsystem(void) TableEncodingData *dataPtr; unsigned size; unsigned short i; + union { + char c; + short s; + } isLe; if (encodingsInitialized) { return; } + isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -595,13 +605,38 @@ TclInitEncodingSubsystem(void) type.clientData = NULL; Tcl_CreateEncoding(&type); - type.encodingName = "unicode"; - type.toUtfProc = UniCharToUtfProc; - type.fromUtfProc = UtfToUniCharProc; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = NULL; + type.encodingName = "ucs-2le"; + type.clientData = INT2PTR(1); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2"; + type.clientData = INT2PTR(isLe.c); + Tcl_CreateEncoding(&type); + + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUtf16Proc; + type.freeProc = NULL; + type.nullSize = 2; + type.encodingName = "utf-16le"; + type.clientData = INT2PTR(1);; + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16"; + type.clientData = INT2PTR(isLe.c);; + Tcl_CreateEncoding(&type); + +#ifndef TCL_NO_DEPRECATED + type.encodingName = "unicode"; Tcl_CreateEncoding(&type); +#endif /* * Need the iso8859-1 encoding in order to process binary data, so force @@ -1279,7 +1314,7 @@ Tcl_ExternalToUtf( if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX; + dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); flags = savedFlags; *statePtr = savedState; } while (1); @@ -2401,9 +2436,9 @@ UtfToUtfProc( /* *------------------------------------------------------------------------- * - * UniCharToUtfProc -- + * Utf16ToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2415,8 +2450,8 @@ UtfToUtfProc( */ static int -UniCharToUtfProc( - ClientData clientData, /* Not used. */ +Utf16ToUtfProc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2468,12 +2503,15 @@ UniCharToUtfProc( break; } + if (clientData) { + ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); + } /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ - - ch = *(unsigned short *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { @@ -2491,9 +2529,9 @@ UniCharToUtfProc( /* *------------------------------------------------------------------------- * - * UtfToUniCharProc -- + * UtfToUtf16Proc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. @@ -2505,9 +2543,8 @@ UniCharToUtfProc( */ static int -UtfToUniCharProc( - ClientData clientData, /* TableEncodingData that specifies - * encoding. */ +UtfToUtf16Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2571,44 +2608,151 @@ UtfToUniCharProc( * casting dst to a Tcl_UniChar. [Bug 1122671] */ -#ifdef WORDS_BIGENDIAN + if (clientData) { #if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); - } else { - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; - *dst++ = (*chPtr & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr & 0xFF); + *dst++ = (*chPtr >> 8); + } else { + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (*chPtr & 0xFF); + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + } #else -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { *dst++ = (*chPtr & 0xFF); *dst++ = (*chPtr >> 8); +#endif } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; +#if TCL_UTF_MAX > 4 + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr >> 8); + *dst++ = (*chPtr & 0xFF); + } else { + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + *dst++ = (*chPtr & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + } +#else + *dst++ = (*chPtr >> 8); *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; +#endif } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUcs2Proc -- + * + * Convert from UTF-8 to UCS-2. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUcs2Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; +#if TCL_UTF_MAX <= 4 + int len; #endif + Tcl_UniChar ch = 0; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); + if ((ch >= 0xD800) && (len < 3)) { + src += TclUtfToUniChar(src, &ch); + ch = 0xFFFD; + } +#else + src += TclUtfToUniChar(src, &ch); + if (ch > 0xFFFF) { + ch = 0xFFFD; + } #endif + + /* + * Need to handle this in a way that won't cause misalignment by + * casting dst to a Tcl_UniChar. [Bug 1122671] + */ + + if (clientData) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); + } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; return result; } - + /* *------------------------------------------------------------------------- * diff --git a/tests/binary.test b/tests/binary.test index aede659..973240f 100644 --- a/tests/binary.test +++ b/tests/binary.test @@ -2912,7 +2912,7 @@ test binary-77.2 {string cat ops on all bytearrays} { } [binary format H* abcd] test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body { - # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX <= 4): + # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX == 3): binary encode hex \U0001f415 binary scan \U0001f415 a* v; set v set str {} diff --git a/tests/chanio.test b/tests/chanio.test index 9dc9e7c..1439fe4 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -888,7 +888,7 @@ test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} -setup # Tcl_ExternalToUtf() set f [openpipe w+ $path(cat)] chan configure $f -translation {auto lf} -buffering none - chan configure $f -encoding unicode + chan configure $f -encoding utf-16 chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r" chan configure $f -buffersize 16 chan gets $f @@ -1129,7 +1129,7 @@ test chan-io-8.2 {PeekAhead: only go to device if no more cached data} -setup { chan event $f read [namespace code { lappend x [chan gets $f line] $line [testchannel inputbuffered $f] }] - chan configure $f -encoding unicode -buffersize 16 -blocking 0 + chan configure $f -encoding utf-16 -buffersize 16 -blocking 0 vwait [namespace which -variable x] chan configure $f -translation auto -encoding ascii -blocking 1 # here diff --git a/tests/encoding.test b/tests/encoding.test index 4736928..da34f03 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -322,18 +322,29 @@ test encoding-15.3 {UtfToUtfProc null character input} teststringbytes { set z } c080 -test encoding-16.1 {UnicodeToUtfProc} -body { - set val [encoding convertfrom unicode NN] +test encoding-16.1 {Utf16ToUtfProc} -body { + set val [encoding convertfrom utf-16 NN] list $val [format %x [scan $val %c]] } -result "\u4e4e 4e4e" -test encoding-16.2 {UnicodeToUtfProc} -body { - set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"] +test encoding-16.2 {Utf16ToUtfProc} -body { + set val [encoding convertfrom utf-16 "\xd8\xd8\xdc\xdc"] + list $val [format %x [scan $val %c]] +} -result "\U460dc 460dc" +test encoding-16.3 {Ucs2ToUtfProc} -body { + set val [encoding convertfrom ucs-2 NN] + list $val [format %x [scan $val %c]] +} -result "\u4e4e 4e4e" +test encoding-16.4 {Ucs2ToUtfProc} -body { + set val [encoding convertfrom ucs-2 "\xd8\xd8\xdc\xdc"] list $val [format %x [scan $val %c]] } -result "\U460dc 460dc" -test encoding-17.1 {UtfToUnicodeProc} -body { - encoding convertto unicode "\U460dc" +test encoding-17.1 {UtfToUtf16Proc} -body { + encoding convertto utf-16 "\U460dc" } -result "\xd8\xd8\xdc\xdc" +test encoding-17.2 {UtfToUcs2Proc} -body { + encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460dc"] +} -result "\ufffd" test encoding-18.1 {TableToUtfProc} { } {} diff --git a/tests/io.test b/tests/io.test index 6470282..39deab6 100644 --- a/tests/io.test +++ b/tests/io.test @@ -918,7 +918,7 @@ test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio test set f [open "|[list [interpreter] $path(cat)]" w+] fconfigure $f -translation {auto lf} -buffering none - fconfigure $f -encoding unicode + fconfigure $f -encoding utf-16 puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r" fconfigure $f -buffersize 16 gets $f @@ -1162,7 +1162,7 @@ test io-8.2 {PeekAhead: only go to device if no more cached data} {stdio testcha variable x lappend x [gets $f line] $line [testchannel inputbuffered $f] } - fconfigure $f -encoding unicode -buffersize 16 -blocking 0 + fconfigure $f -encoding utf-16 -buffersize 16 -blocking 0 vwait [namespace which -variable x] fconfigure $f -translation auto -encoding ascii -blocking 1 # here diff --git a/tests/ioCmd.test b/tests/ioCmd.test index a967139..87ad4af 100644 --- a/tests/ioCmd.test +++ b/tests/ioCmd.test @@ -241,23 +241,23 @@ test iocmd-8.7 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] - fconfigure $f1 -translation lf -eofchar {} -encoding unicode + fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering full -buffersize 4096 -encoding unicode -eofchar {} -translation lf} +} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -translation lf} test iocmd-8.8 {fconfigure command} -setup { file delete $path(test1) set x {} } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation lf -buffering line -buffersize 3030 \ - -eofchar {} -encoding unicode + -eofchar {} -encoding utf-16 lappend x [fconfigure $f1 -buffering] lappend x [fconfigure $f1] } -cleanup { catch {close $f1} -} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding unicode -eofchar {} -translation lf}} +} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -translation lf}} test iocmd-8.9 {fconfigure command} -setup { file delete $path(test1) } -body { diff --git a/tests/source.test b/tests/source.test index 8b146d3..c6cccd6 100644 --- a/tests/source.test +++ b/tests/source.test @@ -240,12 +240,12 @@ test source-7.2 {source -encoding test} -setup { set sourcefile [makeFile {} source.file] file delete $sourcefile set f [open $sourcefile w] - fconfigure $f -encoding unicode + fconfigure $f -encoding utf-16 puts $f "set symbol(square-root) \u221A; set x correct" close $f } -body { set x unset - source -encoding unicode $sourcefile + source -encoding utf-16 $sourcefile set x } -cleanup { removeFile source.file -- cgit v0.12