diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 136 |
1 files changed, 117 insertions, 19 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 3a6385f..fd2337b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -515,10 +515,8 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ -#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ @@ -1080,11 +1078,57 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, 0, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_ExternalToUtfDStringEx -- + * + * Convert a source buffer from the specified encoding into UTF-8. +* The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in utf-8. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +size_t +Tcl_ExternalToUtfDStringEx( + Tcl_Encoding encoding, /* The encoding for the source string, or NULL + * for the default system encoding. */ + const char *src, /* Source string in specified encoding. */ + size_t srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for + * encoding-specific string length. */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, result, soFar, srcRead, dstWrote, dstChars; + int result, soFar, srcRead, dstWrote, dstChars; size_t dstLen; + const char *srcStart = src; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -1101,7 +1145,7 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } @@ -1114,7 +1158,7 @@ Tcl_ExternalToUtfDString( src += srcRead; if (result != TCL_CONVERT_NOSPACE) { Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart); } flags &= ~TCL_ENCODING_START; srcLen -= srcRead; @@ -1273,10 +1317,57 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_UtfToExternalDStringEx(encoding, src, srcLen, 0, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_UtfToExternalDStringEx -- + * + * Convert a source buffer from UTF-8 to the specified encoding. + * The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in the + * target encoding. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +size_t +Tcl_UtfToExternalDStringEx( + Tcl_Encoding encoding, /* The encoding for the converted string, or + * NULL for the default system encoding. */ + const char *src, /* Source string in UTF-8. */ + size_t srcLen, /* Source string length in bytes, or < 0 for + * strlen(). */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, result, soFar, srcRead, dstWrote, dstChars; + int result, soFar, srcRead, dstWrote, dstChars; + const char *srcStart = src; size_t dstLen; Tcl_DStringInit(dstPtr); @@ -1293,7 +1384,7 @@ Tcl_UtfToExternalDString( } else if (srcLen == TCL_INDEX_NONE) { srcLen = strlen(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, &state, dst, dstLen, @@ -1306,7 +1397,7 @@ Tcl_UtfToExternalDString( while (i >= soFar) { Tcl_DStringSetLength(dstPtr, i--); } - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart); } flags &= ~TCL_ENCODING_START; @@ -2134,6 +2225,12 @@ BinaryProc( *------------------------------------------------------------------------- */ +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) +# define STOPONERROR !(flags & TCL_ENCODING_NOCOMPLAIN) +#else +# define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) +#endif + static int UtfToUtfProc( ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ @@ -2216,7 +2313,7 @@ UtfToUtfProc( */ if (flags & TCL_ENCODING_MODIFIED) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_MULTIBYTE; break; } @@ -2231,7 +2328,7 @@ UtfToUtfProc( int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) + if ((len < 2) && (ch != 0) && STOPONERROR && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; break; @@ -2256,7 +2353,8 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (flags & TCL_ENCODING_STOPONERROR) { + + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2271,7 +2369,7 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); ch = low; } else if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2457,7 +2555,7 @@ UtfToUtf32Proc( } len = TclUtfToUCS4(src, &ch); if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -2660,7 +2758,7 @@ UtfToUtf16Proc( } len = TclUtfToUCS4(src, &ch); if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -2880,7 +2978,7 @@ TableToUtfProc( ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_SYNTAX; break; } @@ -2996,7 +3094,7 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3184,7 +3282,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3411,7 +3509,7 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if ((flags & TCL_ENCODING_STOPONERROR) == 0) { + if (!STOPONERROR) { /* * Skip the unknown escape sequence. */ @@ -3586,7 +3684,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } |