From be9f231fdaea85177d206ea98320c888b73f00d8 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 13 Oct 2021 13:33:43 +0000 Subject: First shot at [cef426ff2c]: Encoding UTF-32 missing --- generic/tclEncoding.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++--- tests/encoding.test | 24 ++++- 2 files changed, 258 insertions(+), 15 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 61a931d..4166e45 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -45,7 +45,9 @@ typedef struct { * If nullSize is 1, this is strlen; if * nullSize is 2, this is a function that * returns the number of bytes in a 0x0000 - * terminated string. */ + * terminated string; if nullSize is 4, this + * is a function that returns the number of bytes + * in a 0x00000000 terminated string. */ size_t refCount; /* Number of uses of this structure. */ Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ } Encoding; @@ -216,7 +218,10 @@ static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, static Tcl_EncodingFreeProc TableFreeProc; static Tcl_EncodingConvertProc TableFromUtfProc; static Tcl_EncodingConvertProc TableToUtfProc; +static size_t char16len(const char *src); static size_t unilen(const char *src); +static Tcl_EncodingConvertProc Utf32ToUtfProc; +static Tcl_EncodingConvertProc UtfToUtf32Proc; static Tcl_EncodingConvertProc Utf16ToUtfProc; static Tcl_EncodingConvertProc UtfToUtf16Proc; static Tcl_EncodingConvertProc UtfToUcs2Proc; @@ -577,6 +582,20 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.toUtfProc = Utf32ToUtfProc; + type.fromUtfProc = UtfToUtf32Proc; + type.freeProc = NULL; + type.nullSize = 4; + type.encodingName = "utf-32le"; + type.clientData = INT2PTR(TCL_ENCODING_LE); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32"; + type.clientData = INT2PTR(isLe.c); + Tcl_CreateEncoding(&type); + type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUtf16Proc; type.freeProc = NULL; @@ -1057,10 +1076,12 @@ Tcl_CreateEncoding( encodingPtr->freeProc = typePtr->freeProc; encodingPtr->nullSize = typePtr->nullSize; encodingPtr->clientData = typePtr->clientData; - if (typePtr->nullSize == 1) { - encodingPtr->lengthProc = (LengthProc *) strlen; - } else { + if (typePtr->nullSize == 2) { + encodingPtr->lengthProc = (LengthProc *) char16len; + } else if (typePtr->nullSize == 4) { encodingPtr->lengthProc = (LengthProc *) unilen; + } else { + encodingPtr->lengthProc = (LengthProc *) strlen; } encodingPtr->refCount = 1; encodingPtr->hPtr = NULL; @@ -1343,10 +1364,10 @@ Tcl_UtfToExternalDString( src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - if (encodingPtr->nullSize == 2) { - Tcl_DStringSetLength(dstPtr, soFar + 1); + int i = soFar + encodingPtr->nullSize - 1; + while (i >= soFar) { + Tcl_DStringSetLength(dstPtr, i--); } - Tcl_DStringSetLength(dstPtr, soFar); return Tcl_DStringValue(dstPtr); } @@ -1441,10 +1462,7 @@ Tcl_UtfToExternal( result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); - if (encodingPtr->nullSize == 2) { - dst[*dstWrotePtr + 1] = '\0'; - } - dst[*dstWrotePtr] = '\0'; + memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); return result; } @@ -2335,6 +2353,198 @@ UtfToUtfProc( *dstCharsPtr = numChars; return result; } + +/* + *------------------------------------------------------------------------- + * + * Utf32ToUtfProc -- + * + * Convert from UTF-32 to UTF-8. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +Utf32ToUtfProc( + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in Unicode. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd; + const char *dstEnd, *dstStart; + int result, numChars, charLimit = INT_MAX; + unsigned short ch; + + flags |= PTR2INT(clientData); + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } + result = TCL_OK; + + /* + * Check alignment with utf-32 (4 == sizeof(UTF-32)) + */ + + if ((srcLen % 4) != 0) { + result = TCL_CONVERT_MULTIBYTE; + srcLen &= -4; + } + + srcStart = src; + srcEnd = src + srcLen; + + dstStart = dst; + dstEnd = dst + dstLen - TCL_UTF_MAX; + + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + + if (flags & TCL_ENCODING_LE) { + ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); + } + + /* + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. + */ + + if (ch && ch < 0x80) { + *dst++ = (ch & 0xFF); + } else { + dst += Tcl_UniCharToUtf(ch, dst); + } + src += sizeof(unsigned int); + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUtf32Proc -- + * + * Convert from UTF-8 to UTF-32. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUtf32Proc( + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; + int ch, len; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + len = TclUtfToUCS4(src, &ch); + if (!Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; + if (flags & TCL_ENCODING_LE) { + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xff); + *dst++ = ((ch >> 16) & 0xff); + *dst++ = ((ch >> 24) & 0xff); + } else { + *dst++ = ((ch >> 24) & 0xff); + *dst++ = ((ch >> 16) & 0xff); + *dst++ = ((ch >> 8) & 0xff); + *dst++ = (ch & 0xFF); + } + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} /* *------------------------------------------------------------------------- @@ -3628,7 +3838,7 @@ GetTableEncoding( /* *--------------------------------------------------------------------------- * - * unilen -- + * unilen/char16len -- * * A helper function for the Tcl_ExternalToUtf functions. This function * is similar to strlen for double-byte characters: it returns the number @@ -3644,7 +3854,7 @@ GetTableEncoding( */ static size_t -unilen( +char16len( const char *src) { unsigned short *p; @@ -3655,6 +3865,19 @@ unilen( } return (char *) p - src; } + +static size_t +unilen( + const char *src) +{ + unsigned int *p; + + p = (unsigned int *) src; + while (*p != 0x0000) { + p++; + } + return (char *) p - src; +} /* *------------------------------------------------------------------------- diff --git a/tests/encoding.test b/tests/encoding.test index 25d0827..c6f4e02 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -287,6 +287,12 @@ test encoding-11.8 {encoding: extended Unicode UTF-16} { test encoding-11.9 {encoding: extended Unicode UTF-16} { viewable [encoding convertto utf-16be 😹] } {Ø=Þ9 (\u00D8=\u00DE9)} +test encoding-11.10 {encoding: extended Unicode UTF-32} { + viewable [encoding convertto utf-32le 😹] +} "9\xF6\x01\x00 (9\\u00F6\\u0001\\u0000)" +test encoding-11.11 {encoding: extended Unicode UTF-32} { + viewable [encoding convertto utf-32be 😹] +} "\x00\x01\xF69 (\\u0000\\u0001\\u00F69)" # OpenEncodingFile is fully tested by the rest of the tests in this file. test encoding-12.1 {LoadTableEncoding: normal encoding} { @@ -461,10 +467,18 @@ test encoding-16.4 {Ucs2ToUtfProc} -body { set val [encoding convertfrom ucs-2 NN] list $val [format %x [scan $val %c]] } -result "乎 4e4e" -test encoding-16.4 {Ucs2ToUtfProc} -body { +test encoding-16.5 {Ucs2ToUtfProc} -body { set val [encoding convertfrom ucs-2 "\xD8\xD8\xDC\xDC"] list $val [format %x [scan $val %c]] } -result "\U460DC 460dc" +test encoding-16.6 {Utf32ToUtfProc} -body { + set val [encoding convertfrom utf-32le NN\0\0] + list $val [format %x [scan $val %c]] +} -result "乎 4e4e" +test encoding-16.7 {Utf32ToUtfProc} -body { + set val [encoding convertfrom utf-32be \0\0NN] + list $val [format %x [scan $val %c]] +} -result "乎 4e4e" test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" @@ -478,6 +492,12 @@ test encoding-17.3 {UtfToUtf16Proc} -body { test encoding-17.4 {UtfToUtf16Proc} -body { encoding convertto utf-16le "\uD8D8" } -result "\xFD\xFF" +test encoding-17.5 {UtfToUtf16Proc} -body { + encoding convertto utf-32le "\U460DC" +} -result "\xDC\x60\x04\x00" +test encoding-17.6 {UtfToUtf16Proc} -body { + encoding convertto utf-32be "\U460DC" +} -result "\x00\x04\x60\xDC" test encoding-18.1 {TableToUtfProc} { } {} @@ -777,7 +797,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 89 : 88}] +} -result [expr {[info exists ::tcl_precision] ? 92 : 91}] runtests -- cgit v0.12 From 82df0e7cc9a71bbd7eb06fe4bd1ee9e979b40a13 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 13 Oct 2021 13:51:26 +0000 Subject: Oops --- generic/tclEncoding.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 4166e45..5549874 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2396,7 +2396,7 @@ Utf32ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - unsigned short ch; + int ch; flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { @@ -2436,7 +2436,7 @@ Utf32ToUtfProc( * unsigned short-size data. */ - if (ch && ch < 0x80) { + if ((ch > 0) && (ch < 0x80)) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); -- cgit v0.12 From 7e82249b7fbe90827c600cfd7fd3977ba18a54e2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 14 Oct 2021 14:32:39 +0000 Subject: Code/Comment cleanup --- generic/tclEncoding.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5549874..fad9faa 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -37,7 +37,7 @@ typedef struct { * end-of-string in this encoding. This number * is used to determine the source string * length when the srcLen argument is - * negative. This number can be 1 or 2. */ + * negative. This number can be 1, 2, or 4. */ ClientData clientData; /* Arbitrary value associated with encoding * type. Passed to conversion functions. */ LengthProc *lengthProc; /* Function to compute length of @@ -46,8 +46,8 @@ typedef struct { * nullSize is 2, this is a function that * returns the number of bytes in a 0x0000 * terminated string; if nullSize is 4, this - * is a function that returns the number of bytes - * in a 0x00000000 terminated string. */ + * is a function that returns the number of + * bytes in a 0x00000000 terminated string. */ size_t refCount; /* Number of uses of this structure. */ Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ } Encoding; @@ -218,8 +218,8 @@ static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, static Tcl_EncodingFreeProc TableFreeProc; static Tcl_EncodingConvertProc TableFromUtfProc; static Tcl_EncodingConvertProc TableToUtfProc; -static size_t char16len(const char *src); -static size_t unilen(const char *src); +static size_t unilen(const char *src); +static size_t unilen4(const char *src); static Tcl_EncodingConvertProc Utf32ToUtfProc; static Tcl_EncodingConvertProc UtfToUtf32Proc; static Tcl_EncodingConvertProc Utf16ToUtfProc; @@ -1077,9 +1077,9 @@ Tcl_CreateEncoding( encodingPtr->nullSize = typePtr->nullSize; encodingPtr->clientData = typePtr->clientData; if (typePtr->nullSize == 2) { - encodingPtr->lengthProc = (LengthProc *) char16len; - } else if (typePtr->nullSize == 4) { encodingPtr->lengthProc = (LengthProc *) unilen; + } else if (typePtr->nullSize == 4) { + encodingPtr->lengthProc = (LengthProc *) unilen4; } else { encodingPtr->lengthProc = (LengthProc *) strlen; } @@ -1364,7 +1364,7 @@ Tcl_UtfToExternalDString( src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - int i = soFar + encodingPtr->nullSize - 1; + int i = soFar + encodingPtr->nullSize - 1; while (i >= soFar) { Tcl_DStringSetLength(dstPtr, i--); } @@ -2501,7 +2501,7 @@ UtfToUtf32Proc( } dstStart = dst; - dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); flags |= PTR2INT(clientData); result = TCL_OK; @@ -2540,6 +2540,7 @@ UtfToUtf32Proc( *dst++ = (ch & 0xFF); } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2861,7 +2862,7 @@ UtfToUcs2Proc( *dstCharsPtr = numChars; return result; } - + /* *------------------------------------------------------------------------- * @@ -3303,7 +3304,7 @@ TableFreeProc( ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *) clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3361,7 +3362,7 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; @@ -3838,7 +3839,7 @@ GetTableEncoding( /* *--------------------------------------------------------------------------- * - * unilen/char16len -- + * unilen, unilen4 -- * * A helper function for the Tcl_ExternalToUtf functions. This function * is similar to strlen for double-byte characters: it returns the number @@ -3854,7 +3855,7 @@ GetTableEncoding( */ static size_t -char16len( +unilen( const char *src) { unsigned short *p; @@ -3867,13 +3868,13 @@ char16len( } static size_t -unilen( +unilen4( const char *src) { unsigned int *p; p = (unsigned int *) src; - while (*p != 0x0000) { + while (*p != 0x00000000) { p++; } return (char *) p - src; @@ -3909,7 +3910,7 @@ InitializeEncodingSearchPath( Tcl_Encoding *encodingPtr) { const char *bytes; - int i, numDirs; + int i, numDirs, numBytes; Tcl_Obj *libPathObj, *encodingObj, *searchPathObj; TclNewLiteralStringObj(encodingObj, "encoding"); @@ -3939,11 +3940,11 @@ InitializeEncodingSearchPath( if (*encodingPtr) { ((Encoding *)(*encodingPtr))->refCount++; } - bytes = TclGetString(searchPathObj); + bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes); - *lengthPtr = searchPathObj->length; - *valuePtr = (char *)ckalloc(*lengthPtr + 1); - memcpy(*valuePtr, bytes, *lengthPtr + 1); + *lengthPtr = numBytes; + *valuePtr = (char *)ckalloc(numBytes + 1); + memcpy(*valuePtr, bytes, numBytes + 1); Tcl_DecrRefCount(searchPathObj); } -- cgit v0.12