diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-12 20:39:42 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-12 20:39:42 (GMT) |
| commit | da96c5304100d70d932bcb73796c068d7e416cae (patch) | |
| tree | 70ebb652915b8a287b4fca9dc7e7f8e8165ba3c2 /generic/tclEncoding.c | |
| parent | 8e7a963f7fb10cc556337a18a652fd0c78c51029 (diff) | |
| download | tcl-da96c5304100d70d932bcb73796c068d7e416cae.zip tcl-da96c5304100d70d932bcb73796c068d7e416cae.tar.gz tcl-da96c5304100d70d932bcb73796c068d7e416cae.tar.bz2 | |
Even better support for -DTCL_UTF_MAX=6. Ongoing improvements (TIP being planned)
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 138 |
1 files changed, 127 insertions, 11 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7eb73e8..74e7513 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -234,12 +234,17 @@ static int TableToUtfProc(ClientData clientData, const char *src, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); static size_t unilen(const char *src); -static int UniCharToUtfProc(ClientData clientData, +static int Utf16ToUtfProc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); -static int UtfToUniCharProc(ClientData clientData, +static int UtfToUtf16Proc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUcs2Proc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, @@ -595,14 +600,27 @@ TclInitEncodingSubsystem(void) type.clientData = NULL; Tcl_CreateEncoding(&type); - type.encodingName = "unicode"; - type.toUtfProc = UniCharToUtfProc; - type.fromUtfProc = UtfToUniCharProc; + type.encodingName = "ucs-2"; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; type.clientData = NULL; Tcl_CreateEncoding(&type); + type.encodingName = "utf-16"; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUtf16Proc; + type.freeProc = NULL; + type.nullSize = 2; + type.clientData = NULL; + Tcl_CreateEncoding(&type); + +#ifndef TCL_NO_DEPRECATED + type.encodingName = "unicode"; + Tcl_CreateEncoding(&type); +#endif + /* * Need the iso8859-1 encoding in order to process binary data, so force * it to always be embedded. Note that this encoding *must* be a proper @@ -2401,9 +2419,9 @@ UtfToUtfProc( /* *------------------------------------------------------------------------- * - * UniCharToUtfProc -- + * Utf16ToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2415,7 +2433,7 @@ UtfToUtfProc( */ static int -UniCharToUtfProc( +Utf16ToUtfProc( ClientData clientData, /* Not used. */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ @@ -2491,9 +2509,9 @@ UniCharToUtfProc( /* *------------------------------------------------------------------------- * - * UtfToUniCharProc -- + * UtfToUtf16Proc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. @@ -2505,7 +2523,7 @@ UniCharToUtfProc( */ static int -UtfToUniCharProc( +UtfToUtf16Proc( ClientData clientData, /* TableEncodingData that specifies * encoding. */ const char *src, /* Source string in UTF-8. */ @@ -2612,6 +2630,104 @@ UtfToUniCharProc( /* *------------------------------------------------------------------------- * + * UtfToUcs2Proc -- + * + * Convert from UTF-8 to UCS-2. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUcs2Proc( + ClientData clientData, /* TableEncodingData that specifies + * encoding. */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars, len; + Tcl_UniChar ch = 0; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + src += (len = TclUtfToUniChar(src, &ch)); + if ((ch >= 0xD800) && (len < 3)) { + src += TclUtfToUniChar(src, &ch); + ch = 0xFFFD; + } + + /* + * Need to handle this in a way that won't cause misalignment by + * casting dst to a Tcl_UniChar. [Bug 1122671] + */ + +#ifdef WORDS_BIGENDIAN + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); +#else + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * * TableToUtfProc -- * * Convert from the encoding specified by the TableEncodingData into |
