diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-08-14 15:13:41 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-08-14 15:13:41 (GMT) |
commit | c299d96628a0804e333ace7e60b5cb840877e8f4 (patch) | |
tree | ad00c51935ccc838b6959be83d856ab0395606bb /generic/tclEncoding.c | |
parent | 8ffde10c063dd49dd207d2c8cf8b09e4487edf18 (diff) | |
parent | c9376306301e578615cfee52d2121f78cb31a225 (diff) | |
download | tcl-c299d96628a0804e333ace7e60b5cb840877e8f4.zip tcl-c299d96628a0804e333ace7e60b5cb840877e8f4.tar.gz tcl-c299d96628a0804e333ace7e60b5cb840877e8f4.tar.bz2 |
Merge 8.7
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 226 |
1 files changed, 185 insertions, 41 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 590b0de..cb9c26b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -234,12 +234,17 @@ static int TableToUtfProc(ClientData clientData, const char *src, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); static size_t unilen(const char *src); -static int UniCharToUtfProc(ClientData clientData, +static int Utf16ToUtfProc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); -static int UtfToUniCharProc(ClientData clientData, +static int UtfToUtf16Proc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUcs2Proc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, @@ -564,11 +569,16 @@ TclInitEncodingSubsystem(void) TableEncodingData *dataPtr; unsigned size; unsigned short i; + union { + char c; + short s; + } isLe; if (encodingsInitialized) { return; } + isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -595,13 +605,38 @@ TclInitEncodingSubsystem(void) type.clientData = NULL; Tcl_CreateEncoding(&type); - type.encodingName = "unicode"; - type.toUtfProc = UniCharToUtfProc; - type.fromUtfProc = UtfToUniCharProc; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = NULL; + type.encodingName = "ucs-2le"; + type.clientData = INT2PTR(1); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2"; + type.clientData = INT2PTR(isLe.c); + Tcl_CreateEncoding(&type); + + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUtf16Proc; + type.freeProc = NULL; + type.nullSize = 2; + type.encodingName = "utf-16le"; + type.clientData = INT2PTR(1);; + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16"; + type.clientData = INT2PTR(isLe.c);; + Tcl_CreateEncoding(&type); + +#ifndef TCL_NO_DEPRECATED + type.encodingName = "unicode"; Tcl_CreateEncoding(&type); +#endif /* * Need the iso8859-1 encoding in order to process binary data, so force @@ -1279,7 +1314,7 @@ Tcl_ExternalToUtf( if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX; + dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); flags = savedFlags; *statePtr = savedState; } while (1); @@ -2401,9 +2436,9 @@ UtfToUtfProc( /* *------------------------------------------------------------------------- * - * UniCharToUtfProc -- + * Utf16ToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2415,8 +2450,8 @@ UtfToUtfProc( */ static int -UniCharToUtfProc( - ClientData clientData, /* Not used. */ +Utf16ToUtfProc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2468,12 +2503,15 @@ UniCharToUtfProc( break; } + if (clientData) { + ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); + } /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ - - ch = *(unsigned short *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { @@ -2491,9 +2529,9 @@ UniCharToUtfProc( /* *------------------------------------------------------------------------- * - * UtfToUniCharProc -- + * UtfToUtf16Proc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. @@ -2505,9 +2543,8 @@ UniCharToUtfProc( */ static int -UtfToUniCharProc( - ClientData clientData, /* TableEncodingData that specifies - * encoding. */ +UtfToUtf16Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2571,44 +2608,151 @@ UtfToUniCharProc( * casting dst to a Tcl_UniChar. [Bug 1122671] */ -#ifdef WORDS_BIGENDIAN + if (clientData) { #if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); - } else { - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; - *dst++ = (*chPtr & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr & 0xFF); + *dst++ = (*chPtr >> 8); + } else { + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (*chPtr & 0xFF); + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + } #else -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { *dst++ = (*chPtr & 0xFF); *dst++ = (*chPtr >> 8); +#endif } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; +#if TCL_UTF_MAX > 4 + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr >> 8); + *dst++ = (*chPtr & 0xFF); + } else { + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + *dst++ = (*chPtr & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + } +#else + *dst++ = (*chPtr >> 8); *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; +#endif } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUcs2Proc -- + * + * Convert from UTF-8 to UCS-2. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUcs2Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; +#if TCL_UTF_MAX <= 4 + int len; #endif + Tcl_UniChar ch = 0; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); + if ((ch >= 0xD800) && (len < 3)) { + src += TclUtfToUniChar(src, &ch); + ch = 0xFFFD; + } +#else + src += TclUtfToUniChar(src, &ch); + if (ch > 0xFFFF) { + ch = 0xFFFD; + } #endif + + /* + * Need to handle this in a way that won't cause misalignment by + * casting dst to a Tcl_UniChar. [Bug 1122671] + */ + + if (clientData) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); + } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; return result; } - + /* *------------------------------------------------------------------------- * |