diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 298 |
1 files changed, 98 insertions, 200 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 3ef4e58..8a29722 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -220,14 +220,7 @@ static size_t unilen(const char *src); static Tcl_EncodingConvertProc Utf16ToUtfProc; static Tcl_EncodingConvertProc UtfToUtf16Proc; static Tcl_EncodingConvertProc UtfToUcs2Proc; -static int UtfToUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, - int dstLen, int *srcReadPtr, - int *dstWrotePtr, int *dstCharsPtr, - int pureNullMode); -static Tcl_EncodingConvertProc UtfIntToUtfExtProc; -static Tcl_EncodingConvertProc UtfExtToUtfIntProc; +static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; @@ -517,6 +510,10 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ +/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ +#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ + void TclInitEncodingSubsystem(void) { @@ -525,7 +522,7 @@ TclInitEncodingSubsystem(void) unsigned size; unsigned short i; union { - char c; + unsigned char c; short s; } isLe; @@ -533,7 +530,7 @@ TclInitEncodingSubsystem(void) return; } - isLe.s = 1; + isLe.s = TCL_ENCODING_LE; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -553,8 +550,8 @@ TclInitEncodingSubsystem(void) tclIdentityEncoding = Tcl_CreateEncoding(&type); type.encodingName = "utf-8"; - type.toUtfProc = UtfExtToUtfIntProc; - type.fromUtfProc = UtfIntToUtfExtProc; + type.toUtfProc = UtfToUtfProc; + type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; type.clientData = NULL; @@ -565,7 +562,7 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; type.clientData = INT2PTR(0); @@ -579,7 +576,7 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "utf-16le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); @@ -1077,7 +1074,7 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED; while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1186,25 +1183,24 @@ Tcl_ExternalToUtf( if (!noTerminate) { /* * If there are any null characters in the middle of the buffer, - * they will converted to the UTF-8 null character (\xC080). To get + * they will converted to the UTF-8 null character (\xC0\x80). To get * the actual \0 at the end of the destination buffer, we need to * append it manually. First make room for it... */ dstLen--; } + flags |= TCL_ENCODING_MODIFIED; do { - int savedFlags = flags; Tcl_EncodingState savedState = *statePtr; result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); if (*dstCharsPtr <= maxChars) { break; } dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); - flags = savedFlags; *statePtr = savedState; } while (1); if (!noTerminate) { @@ -1269,8 +1265,8 @@ Tcl_UtfToExternalDString( flags = TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, - &dstChars); + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); if (result != TCL_CONVERT_NOSPACE) { @@ -1371,8 +1367,8 @@ Tcl_UtfToExternal( dstLen -= encodingPtr->nullSize; result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr); + flags, statePtr, dst, dstLen, srcReadPtr, + dstWrotePtr, dstCharsPtr); if (encodingPtr->nullSize == 2) { dst[*dstWrotePtr + 1] = '\0'; } @@ -2095,104 +2091,6 @@ BinaryProc( /* *------------------------------------------------------------------------- * - * UtfIntToUtfExtProc -- - * - * Convert from UTF-8 to UTF-8. While converting null-bytes from the - * Tcl's internal representation (0xC0, 0x80) to the official - * representation (0x00). See UtfToUtfProc for details. - * - * Results: - * Returns TCL_OK if conversion was successful. - * - * Side effects: - * None. - * - *------------------------------------------------------------------------- - */ - -static int -UtfIntToUtfExtProc( - ClientData clientData, - const char *src, /* Source string in UTF-8. */ - int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ - char *dst, /* Output buffer in which converted string - * is stored. */ - int dstLen, /* The maximum length of output buffer in - * bytes. */ - int *srcReadPtr, /* Filled with the number of bytes from the - * source string that were converted. This may - * be less than the original source length if - * there was a problem converting some source - * characters. */ - int *dstWrotePtr, /* Filled with the number of bytes that were - * stored in the output buffer as a result of - * the conversion. */ - int *dstCharsPtr) /* Filled with the number of characters that - * correspond to the bytes stored in the - * output buffer. */ -{ - return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 1); -} - -/* - *------------------------------------------------------------------------- - * - * UtfExtToUtfIntProc -- - * - * Convert from UTF-8 to UTF-8 while converting null-bytes from the - * official representation (0x00) to Tcl's internal representation (0xC0, - * 0x80). See UtfToUtfProc for details. - * - * Results: - * Returns TCL_OK if conversion was successful. - * - * Side effects: - * None. - * - *------------------------------------------------------------------------- - */ - -static int -UtfExtToUtfIntProc( - ClientData clientData, - const char *src, /* Source string in UTF-8. */ - int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ - char *dst, /* Output buffer in which converted string is - * stored. */ - int dstLen, /* The maximum length of output buffer in - * bytes. */ - int *srcReadPtr, /* Filled with the number of bytes from the - * source string that were converted. This may - * be less than the original source length if - * there was a problem converting some source - * characters. */ - int *dstWrotePtr, /* Filled with the number of bytes that were - * stored in the output buffer as a result of - * the conversion. */ - int *dstCharsPtr) /* Filled with the number of characters that - * correspond to the bytes stored in the - * output buffer. */ -{ - return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 0); -} - -/* - *------------------------------------------------------------------------- - * * UtfToUtfProc -- * * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation @@ -2214,11 +2112,7 @@ UtfToUtfProc( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2231,21 +2125,15 @@ UtfToUtfProc( int *dstWrotePtr, /* Filled with the number of bytes that were * stored in the output buffer as a result of * the conversion. */ - int *dstCharsPtr, /* Filled with the number of characters that + int *dstCharsPtr) /* Filled with the number of characters that * correspond to the bytes stored in the * output buffer. */ - int pureNullMode) /* Convert embedded nulls from internal - * representation to real null-bytes or vice - * versa. Also combine or separate surrogate pairs */ { const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; - int *chPtr = (int *) statePtr; + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } result = TCL_OK; srcStart = src; @@ -2262,7 +2150,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2275,48 +2163,62 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) { + if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) { /* * Copy 7bit characters, but skip null-bytes when we are in input * mode, so that they get converted to 0xC080. */ *dst++ = *src++; - } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && - (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { + } else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) + && UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) { /* * Convert 0xC080 to real nulls when we are in output mode. */ *dst++ = 0; src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an - * incomplete char its bytes are made to represent themselves. + * incomplete char its bytes are made to represent themselves + * unless the user has explicitly asked to be told. */ - *chPtr = UCHAR(*src); + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_MULTIBYTE; + break; + } + ch = UCHAR(*src); src += 1; - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } else { - src += TclUtfToUCS4(src, chPtr); - if ((*chPtr | 0x7FF) == 0xDFFF) { - /* A surrogate character is detected, handle especially */ - int low = *chPtr; - size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); - continue; + size_t len = TclUtfToUCS4(src, &ch); + if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { + result = TCL_CONVERT_SYNTAX; + break; + } + src += len; + if ((ch | 0x7FF) == 0xDFFF) { + /* + * A surrogate character is detected, handle especially. + */ + + int low = ch; + len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; + + if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); + continue; } src += len; - dst += Tcl_UniCharToUtf(*chPtr, dst); - *chPtr = low; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } } @@ -2344,7 +2246,7 @@ UtfToUtfProc( static int Utf16ToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2370,18 +2272,27 @@ Utf16ToUtfProc( int result, numChars, charLimit = INT_MAX; unsigned short ch; + flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } result = TCL_OK; - /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ + /* + * Check alignment with utf-16 (2 == sizeof(UTF-16)) + */ + if ((srcLen % 2) != 0) { result = TCL_CONVERT_MULTIBYTE; srcLen--; } - /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + + /* + * If last code point is a high surrogate, we cannot handle that yet. + */ + + if ((srcLen >= 2) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } @@ -2398,15 +2309,17 @@ Utf16ToUtfProc( break; } - if (clientData) { + if (flags & TCL_ENCODING_LE) { ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } + /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ + if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { @@ -2443,11 +2356,7 @@ UtfToUtf16Proc( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2466,11 +2375,8 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2496,38 +2402,27 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, chPtr); - + src += TclUtfToUCS4(src, &ch); if (clientData) { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); -#endif } else { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); } else { - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; - *dst++ = (*chPtr & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; + *dst++ = (ch & 0xFF); } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif } } *srcReadPtr = src - srcStart; @@ -2554,7 +2449,7 @@ UtfToUtf16Proc( static int UtfToUcs2Proc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2582,6 +2477,7 @@ UtfToUcs2Proc( #endif Tcl_UniChar ch = 0; + flags |= PTR2INT(clientData); srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2625,7 +2521,7 @@ UtfToUcs2Proc( * casting dst to a Tcl_UniChar. [Bug 1122671] */ - if (clientData) { + if (flags & TCL_ENCODING_LE) { *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); } else { @@ -3033,7 +2929,9 @@ Iso88591FromUtfProc( break; } #if TCL_UTF_MAX <= 3 - if ((ch >= 0xD800) && (len < 3)) len = 4; + if ((ch >= 0xD800) && (len < 3)) { + len = 4; + } #endif /* * Plunge on, using '?' as a fallback character. @@ -3078,7 +2976,7 @@ TableFreeProc( ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *)clientData; + TableEncodingData *dataPtr = (TableEncodingData *) clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3136,7 +3034,7 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; |