diff options
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 992 |
1 files changed, 799 insertions, 193 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 23d71ff..a87eb7f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -10,6 +10,7 @@ */ #include "tclInt.h" +#include "tclIO.h" typedef size_t (LengthProc)(const char *src); @@ -35,7 +36,7 @@ typedef struct { * encoding is deleted. */ void *clientData; /* Arbitrary value associated with encoding * type. Passed to conversion functions. */ - size_t nullSize; /* Number of 0x00 bytes that signify + Tcl_Size nullSize; /* Number of 0x00 bytes that signify * end-of-string in this encoding. This number * is used to determine the source string * length when the srcLen argument is @@ -187,6 +188,36 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* + * Names of encoding profiles and corresponding integer values. + * Keep alphabetical order for error messages. + */ +static struct TclEncodingProfiles { + const char *name; + int value; +} encodingProfiles[] = { + {"replace", TCL_ENCODING_PROFILE_REPLACE}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, +}; +#define PROFILE_TCL8(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8)) +#define PROFILE_STRICT(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) +#define PROFILE_REPLACE(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) + +#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) + +/* * The following variable is used in the sparse matrix code for a * TableEncoding to represent a page in the table that has no entries. */ @@ -229,6 +260,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field * of the internalrep. This should help the lifetime of encodings be more useful. @@ -374,7 +406,7 @@ int Tcl_SetEncodingSearchPath( Tcl_Obj *searchPath) { - size_t dummy; + Tcl_Size dummy; if (TCL_ERROR == TclListObjLengthM(NULL, searchPath, &dummy)) { return TCL_ERROR; @@ -421,7 +453,7 @@ void TclSetLibraryPath( Tcl_Obj *path) { - size_t dummy; + Tcl_Size dummy; if (TCL_ERROR == TclListObjLengthM(NULL, path, &dummy)) { return; @@ -457,7 +489,7 @@ TclSetLibraryPath( static void FillEncodingFileMap(void) { - size_t i, numDirs = 0; + Tcl_Size i, numDirs = 0; Tcl_Obj *map, *searchPath; searchPath = Tcl_GetEncodingSearchPath(); @@ -472,7 +504,7 @@ FillEncodingFileMap(void) * entries found, we favor files earlier on the search path. */ - size_t j, numFiles; + Tcl_Size j, numFiles; Tcl_Obj *directory, *matchFileList; Tcl_Obj **filev; Tcl_GlobTypeData readableFiles = { @@ -521,10 +553,16 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and - * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ -#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ -#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +/* + * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS + * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this + * when adding bits. TODO - should really be defined in a single file. + * + * To prevent conflicting bits, only define bits within 0xff00 mask here. + */ +#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */ +#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ void TclInitEncodingSubsystem(void) @@ -537,12 +575,16 @@ TclInitEncodingSubsystem(void) char c; short s; } isLe; + int leFlags; if (encodingsInitialized) { return; } - isLe.s = TCL_ENCODING_LE; + /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */ + isLe.s = 1; + leFlags = isLe.c ? TCL_ENCODING_LE : 0; + Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -566,9 +608,9 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = INT2PTR(TCL_ENCODING_UTF); + type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(0); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); @@ -577,13 +619,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(TCL_ENCODING_LE|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; - type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -597,7 +639,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "utf-32"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -605,13 +647,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "utf-16le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); + type.clientData = INT2PTR(TCL_ENCODING_LE|ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; - type.clientData = INT2PTR(0); + type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags|ENCODING_UTF); Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED @@ -895,7 +937,7 @@ Tcl_GetEncodingNames( Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); Tcl_CreateHashEntry(&table, - Tcl_NewStringObj(encodingPtr->name, -1), &dummy); + Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy); } Tcl_MutexUnlock(&encodingMutex); @@ -940,7 +982,7 @@ Tcl_GetEncodingNames( * *--------------------------------------------------------------------------- */ -size_t +Tcl_Size Tcl_GetEncodingNulLength( Tcl_Encoding encoding) { @@ -1107,12 +1149,13 @@ Tcl_ExternalToUtfDString( Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - size_t srcLen, /* Source string length in bytes, or -1 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_ExternalToUtfDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1126,50 +1169,82 @@ Tcl_ExternalToUtfDString( * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * Any other flag bits will cause an error to be returned (for future + * compatibility) * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. - * + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner. + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -size_t +int Tcl_ExternalToUtfDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - size_t srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int result, soFar, srcRead, dstWrote, dstChars; - size_t dstLen; + int result; + Tcl_Size dstLen, soFar; const char *srcStart = src; + /* DO FIRST - Must always be initialized before returning */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; if (encoding == NULL) { encoding = systemEncoding; } - encodingPtr = (Encoding *) encoding; + encodingPtr = (Encoding *)encoding; if (src == NULL) { srcLen = 0; @@ -1177,23 +1252,70 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } - flags |= TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= ENCODING_INPUT; } while (1) { - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); - soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + int srcChunkLen, srcChunkRead; + int dstChunkLen, dstChunkWrote, dstChunkChars; + + if (srcLen > INT_MAX) { + srcChunkLen = INT_MAX; + } else { + srcChunkLen = srcLen; + flags |= TCL_ENCODING_END; /* Last chunk */ + } + dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen; + + result = encodingPtr->toUtfProc(encodingPtr->clientData, src, + srcChunkLen, flags, &state, dst, dstChunkLen, + &srcChunkRead, &dstChunkWrote, &dstChunkChars); + soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr); + + src += srcChunkRead; + + /* + * Keep looping in two case - + * - our destination buffer did not have enough room + * - we had not passed in all the data and error indicated fragment + * of a multibyte character + * In both cases we have to grow buffer, move the input source pointer + * and loop. Otherwise, return the result we got. + */ + if ((result != TCL_CONVERT_NOSPACE) && + !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) { + Tcl_Size nBytesProcessed = (src - srcStart); - src += srcRead; - if (result != TCL_CONVERT_NOSPACE) { Tcl_DStringSetLength(dstPtr, soFar); - return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + char buf[TCL_INTEGER_SPACE]; + sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("unexpected byte sequence starting at index %" + TCL_Z_MODIFIER "u: '\\x%02X'", + nBytesProcessed, + UCHAR(srcStart[nBytesProcessed]))); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL); + } + } + return result; } + flags &= ~TCL_ENCODING_START; - srcLen -= srcRead; + srcLen -= srcChunkRead; + if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); } @@ -1227,8 +1349,8 @@ Tcl_ExternalToUtf( Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - size_t srcLen, /* Source string length in bytes, or -1 - * for encoding-specific string length. */ + Tcl_Size srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for + * encoding-specific string length. */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state * information used during a piecewise @@ -1237,7 +1359,7 @@ Tcl_ExternalToUtf( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ - size_t dstLen, /* The maximum length of output buffer in + Tcl_Size dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. This may @@ -1272,6 +1394,13 @@ Tcl_ExternalToUtf( flags |= TCL_ENCODING_START | TCL_ENCODING_END; statePtr = &state; } + if (srcLen > INT_MAX) { + srcLen = INT_MAX; + flags &= ~TCL_ENCODING_END; + } + if (dstLen > INT_MAX) { + dstLen = INT_MAX; + } if (srcReadPtr == NULL) { srcReadPtr = &srcRead; } @@ -1286,6 +1415,9 @@ Tcl_ExternalToUtf( } if (!noTerminate) { + if ((int) dstLen < 1) { + return TCL_CONVERT_NOSPACE; + } /* * If there are any null characters in the middle of the buffer, * they will converted to the UTF-8 null character (\xC0\x80). To get @@ -1294,9 +1426,13 @@ Tcl_ExternalToUtf( */ dstLen--; + } else { + if (dstLen <= 0 && srcLen > 0) { + return TCL_CONVERT_NOSPACE; + } } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= ENCODING_INPUT; } do { Tcl_EncodingState savedState = *statePtr; @@ -1344,12 +1480,13 @@ Tcl_UtfToExternalDString( Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - size_t srcLen, /* Source string length in bytes, or -1 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_UtfToExternalDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1362,45 +1499,73 @@ Tcl_UtfToExternalDString( * Convert a source buffer from UTF-8 to the specified encoding. * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in the - * target encoding. - * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -size_t +int Tcl_UtfToExternalDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - size_t srcLen, /* Source string length in bytes, or < 0 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ - int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int result, soFar, srcRead, dstWrote, dstChars; + int result; const char *srcStart = src; - size_t dstLen; + Tcl_Size dstLen, soFar; + /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1414,24 +1579,75 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen == TCL_INDEX_NONE) { srcLen = strlen(src); } - flags |= TCL_ENCODING_START | TCL_ENCODING_END; + + flags |= TCL_ENCODING_START; while (1) { + int srcChunkLen, srcChunkRead; + int dstChunkLen, dstChunkWrote, dstChunkChars; + + if (srcLen > INT_MAX) { + srcChunkLen = INT_MAX; + } else { + srcChunkLen = srcLen; + flags |= TCL_ENCODING_END; /* Last chunk */ + } + dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen; + result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, - &srcRead, &dstWrote, &dstChars); - soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); - - src += srcRead; - if (result != TCL_CONVERT_NOSPACE) { - int i = soFar + encodingPtr->nullSize - 1; - while (i >= soFar) { + srcChunkLen, flags, &state, dst, dstChunkLen, + &srcChunkRead, &dstChunkWrote, &dstChunkChars); + soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr); + + /* Move past the part processed in this go around */ + src += srcChunkRead; + + /* + * Keep looping in two case - + * - our destination buffer did not have enough room + * - we had not passed in all the data and error indicated fragment + * of a multibyte character + * In both cases we have to grow buffer, move the input source pointer + * and loop. Otherwise, return the result we got. + */ + if ((result != TCL_CONVERT_NOSPACE) && + !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) { + Tcl_Size nBytesProcessed = (src - srcStart); + size_t i = soFar + encodingPtr->nullSize - 1; + /* Loop as DStringSetLength only stores one nul byte at a time */ + while (i+1 >= soFar+1) { Tcl_DStringSetLength(dstPtr, i--); } - return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4); + sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "unexpected character at index %" TCL_Z_MODIFIER + "u: 'U+%06X'", + pos, + ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; - srcLen -= srcRead; + srcLen -= srcChunkRead; + if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); } @@ -1465,8 +1681,8 @@ Tcl_UtfToExternal( Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - size_t srcLen, /* Source string length in bytes, or -1 - * for strlen(). */ + Tcl_Size srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for + * strlen(). */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state * information used during a piecewise @@ -1475,7 +1691,7 @@ Tcl_UtfToExternal( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string * is stored. */ - size_t dstLen, /* The maximum length of output buffer in + Tcl_Size dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. This may @@ -1507,6 +1723,13 @@ Tcl_UtfToExternal( flags |= TCL_ENCODING_START | TCL_ENCODING_END; statePtr = &state; } + if (srcLen > INT_MAX) { + srcLen = INT_MAX; + flags &= ~TCL_ENCODING_END; + } + if (dstLen > INT_MAX) { + dstLen = INT_MAX; + } if (srcReadPtr == NULL) { srcReadPtr = &srcRead; } @@ -1517,10 +1740,17 @@ Tcl_UtfToExternal( dstCharsPtr = &dstChars; } + if (dstLen < encodingPtr->nullSize) { + return TCL_CONVERT_NOSPACE; + } dstLen -= encodingPtr->nullSize; result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); + /* + * Buffer is terminated irrespective of result. Not sure this is + * reasonable but keep for historical/compatibility reasons. + */ memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); return result; @@ -1580,17 +1810,17 @@ OpenEncodingFileChannel( const char *name) /* The name of the encoding file on disk and * also the name for new encoding. */ { - Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1); + Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE); Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj); Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath()); Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap); Tcl_Obj **dir, *path, *directory = NULL; Tcl_Channel chan = NULL; - size_t i, numDirs; + Tcl_Size i, numDirs; TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir); Tcl_IncrRefCount(nameObj); - Tcl_AppendToObj(fileNameObj, ".enc", -1); + Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE); Tcl_IncrRefCount(fileNameObj); Tcl_DictObjGet(NULL, map, nameObj, &directory); @@ -1851,7 +2081,7 @@ LoadTableEncoding( for (i = 0; i < numPages; i++) { int ch; const char *p; - size_t expected = 3 + 16 * (16 * 4 + 1); + Tcl_Size expected = 3 + 16 * (16 * 4 + 1); if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) { return NULL; @@ -2087,7 +2317,7 @@ LoadEscapeEncoding( Tcl_DStringInit(&escapeData); while (1) { - size_t argc; + Tcl_Size argc; const char **argv; char *line; Tcl_DString lineString; @@ -2135,7 +2365,7 @@ LoadEscapeEncoding( Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est)); } } - Tcl_Free((void *)argv); + Tcl_Free(argv); Tcl_DStringFree(&lineString); } @@ -2222,6 +2452,7 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { srcLen = *dstCharsPtr; } @@ -2242,9 +2473,9 @@ BinaryProc( * * UtfToUtfProc -- * - * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation - * is not a no-op, because it will turn a stream of improperly formed - * UTF-8 into a properly formed stream. + * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation + * is not a no-op, because it turns a stream of improperly formed + * UTF-8 into a properly-formed stream. * * Results: * Returns TCL_OK if conversion was successful. @@ -2255,14 +2486,12 @@ BinaryProc( *------------------------------------------------------------------------- */ -#define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN)) - static int UtfToUtfProc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ + void *clientData, /* additional flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ @@ -2284,12 +2513,14 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int profile; result = TCL_OK; srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= 6; } @@ -2299,9 +2530,11 @@ UtfToUtfProc( dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); + dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + profile = CHANNEL_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2315,62 +2548,85 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { + if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { /* * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to 0xC080. + * mode, so that they get converted to \xC0\x80. */ - *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { - /* - * If in input mode, and -strict is specified: This is an error. - */ - if (flags & TCL_ENCODING_MODIFIED) { - result = TCL_CONVERT_SYNTAX; - break; + } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && + (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || + PROFILE_REPLACE(profile))) { + /* Special sequence \xC0\x80 */ + if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } + } else { + /* + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. + */ + *dst++ = 0; + src += 2; } - /* - * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. - */ - *dst++ = 0; - src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* + * Incomplete byte sequence. * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves * unless the user has explicitly asked to be told. */ - if (flags & TCL_ENCODING_MODIFIED) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { - result = TCL_CONVERT_MULTIBYTE; + if (flags & ENCODING_INPUT) { + /* Incomplete bytes for modified UTF-8 target */ + if (PROFILE_STRICT(profile)) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; break; } - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { - result = TCL_CONVERT_SYNTAX; - break; } - ch = UCHAR(*src++); + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + ++src; } else { + /* TCL_ENCODING_PROFILE_TCL8 */ char chbuf[2]; chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); } else { - const char *saveSrc = src; + int isInvalid = 0; size_t len = TclUtfToUCS4(src, &ch); - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { - result = TCL_CONVERT_SYNTAX; - break; + if (flags & ENCODING_INPUT) { + if ((len < 2) && (ch != 0)) { + isInvalid = 1; + } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) { + isInvalid = 1; + } + if (isInvalid) { + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } + } } + + const char *saveSrc = src; src += len; - if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { + if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { if (ch > 0xFFFF) { /* CESU-8 6-byte sequence for chars > U+FFFF */ ch -= 0x10000; @@ -2387,33 +2643,45 @@ UtfToUtfProc( *dst++ = (char) ((ch | 0x80) & 0xBF); continue; #if TCL_UTF_MAX < 4 - } else if ((ch | 0x7FF) == 0xDFFF) { + } else if (SURROGATE(ch)) { /* * A surrogate character is detected, handle especially. */ - - int low = ch; - len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - - if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; + if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (PROFILE_REPLACE(profile)) { + /* TODO - is this right for cesu8 or should we fall through below? */ + ch = UNICODE_REPLACE_CHAR; + } else { + int low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + goto cesu8; } - goto cesu8; + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; #endif - } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) { + } else if (PROFILE_STRICT(profile) && + (!(flags & ENCODING_INPUT)) && + SURROGATE(ch)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) { + } else if (PROFILE_STRICT(profile) && + (flags & ENCODING_INPUT) && + SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2470,8 +2738,9 @@ Utf32ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - int ch; + int ch = 0, bytesLeft = srcLen % 4; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2481,11 +2750,26 @@ Utf32ToUtfProc( /* * Check alignment with utf-32 (4 == sizeof(UTF-32)) */ + if (bytesLeft != 0) { + /* We have a truncated code unit */ + result = TCL_CONVERT_MULTIBYTE; + srcLen -= bytesLeft; + } - if ((srcLen % 4) != 0) { +#if TCL_UTF_MAX < 4 + /* + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. + */ + + if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) { result = TCL_CONVERT_MULTIBYTE; - srcLen &= -4; + srcLen-= 4; } +#endif srcStart = src; srcEnd = src + srcLen; @@ -2499,17 +2783,35 @@ Utf32ToUtfProc( break; } +#if TCL_UTF_MAX < 4 + int prev = ch; +#endif if (flags & TCL_ENCODING_LE) { ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && ((ch & ~0x7FF) == 0xD800))) { - if (STOPONERROR) { +#if TCL_UTF_MAX < 4 + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } +#endif + + if ((unsigned)ch > 0x10FFFF) { + ch = UNICODE_REPLACE_CHAR; + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } + } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; +#if TCL_UTF_MAX < 4 + ch = 0; +#endif + break; + } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) { + ch = UNICODE_REPLACE_CHAR; } /* @@ -2517,12 +2819,39 @@ Utf32ToUtfProc( * unsigned short-size data. */ - if ((ch > 0) && (ch < 0x80)) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned int); + src += 4; + } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ +#if TCL_UTF_MAX < 4 + if (HIGH_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } +#endif + + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src += bytesLeft; /* Go past truncated code unit */ + } + } } *srcReadPtr = src - srcStart; @@ -2577,6 +2906,7 @@ UtfToUtf32Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2601,11 +2931,14 @@ UtfToUtf32Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2669,8 +3002,9 @@ Utf16ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - unsigned short ch; + unsigned short ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2687,10 +3021,11 @@ Utf16ToUtfProc( } /* - * If last code point is a high surrogate, we cannot handle that yet. + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. */ - if ((srcLen >= 2) && + if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) && ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; @@ -2702,29 +3037,100 @@ Utf16ToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } + unsigned short prev = ch; if (flags & TCL_ENCODING_LE) { ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; /* Go back to beginning of high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } else if (PROFILE_REPLACE(flags)) { + /* + * Previous loop wrote a single byte to mark the high surrogate. + * Replace it with the replacement character. Further, restart + * current loop iteration since need to recheck destination space + * and reset processing of current character. + */ + ch = UNICODE_REPLACE_CHAR; + dst--; + dst += Tcl_UniCharToUtf(ch, dst); + src -= 2; + numChars--; + continue; + } else { + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + } /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); - } else { + } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); + } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { + /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } else { + /* PROFILE_REPLACE */ + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } + } else { + dst += Tcl_UniCharToUtf(ch, dst); + } + } + + if (HIGH_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; + dst--; + numChars--; + } else if (PROFILE_REPLACE(flags)) { + dst--; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src++; /* Go past truncated code unit */ + } } - src += sizeof(unsigned short); } *srcReadPtr = src - srcStart; @@ -2779,6 +3185,7 @@ UtfToUtf16Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2803,11 +3210,14 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2878,12 +3288,10 @@ UtfToUcs2Proc( * output buffer. */ { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; - int result, numChars; -#if TCL_UTF_MAX < 4 - int len; -#endif + int result, numChars, len; Tcl_UniChar ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); srcStart = src; srcEnd = src + srcLen; @@ -2911,17 +3319,32 @@ UtfToUcs2Proc( break; } #if TCL_UTF_MAX < 4 - src += (len = TclUtfToUniChar(src, &ch)); + len = TclUtfToUniChar(src, &ch); if ((ch >= 0xD800) && (len < 3)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } + src += len; src += TclUtfToUniChar(src, &ch); - ch = 0xFFFD; + ch = UNICODE_REPLACE_CHAR; } #else - src += TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); if (ch > 0xFFFF) { - ch = 0xFFFD; + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = UNICODE_REPLACE_CHAR; } #endif + if (PROFILE_STRICT(flags) && SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; + break; + } + + src += len; /* * Need to handle this in a way that won't cause misalignment by @@ -2991,6 +3414,7 @@ TableToUtfProc( const unsigned short *pageZero; TableEncodingData *dataPtr = (TableEncodingData *)clientData; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3014,30 +3438,47 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { - src--; - result = TCL_CONVERT_MULTIBYTE; - break; + if (!(flags & TCL_ENCODING_END)) { + src--; + result = TCL_CONVERT_MULTIBYTE; + break; + } else if (PROFILE_STRICT(flags)) { + src--; + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + src--; /* See bug [bdcb5126c0] */ + result = TCL_CONVERT_MULTIBYTE; + break; + } + } else { + ch = toUnicode[byte][*((unsigned char *)src)]; } - ch = toUnicode[byte][*((unsigned char *) src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar)byte; + } } /* * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3107,6 +3548,7 @@ TableFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3139,11 +3581,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3213,6 +3655,7 @@ Iso88591ToUtfProc( const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3236,7 +3679,7 @@ Iso88591ToUtfProc( * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3297,6 +3740,7 @@ Iso88591FromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3327,7 +3771,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3340,7 +3784,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3444,6 +3888,7 @@ EscapeToUtfProc( int state, result, numChars, charLimit = INT_MAX; const char *dstStart, *dstEnd; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3554,9 +3999,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!STOPONERROR) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3666,6 +4112,7 @@ EscapeFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3729,7 +4176,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3980,12 +4427,11 @@ unilen4( static void InitializeEncodingSearchPath( char **valuePtr, - size_t *lengthPtr, + TCL_HASH_TYPE *lengthPtr, Tcl_Encoding *encodingPtr) { const char *bytes; - size_t i, numDirs; - size_t numBytes; + Tcl_Size i, numDirs, numBytes; Tcl_Obj *libPathObj, *encodingObj, *searchPathObj; TclNewLiteralStringObj(encodingObj, "encoding"); @@ -4024,6 +4470,166 @@ InitializeEncodingSearchPath( } /* + *------------------------------------------------------------------------ + * + * TclEncodingProfileParseName -- + * + * Maps an encoding profile name to its integer equivalent. + * + * Results: + * TCL_OK on success or TCL_ERROR on failure. + * + * Side effects: + * Returns the profile enum value in *profilePtr + * + *------------------------------------------------------------------------ + */ +int +TclEncodingProfileNameToId( + Tcl_Interp *interp, /* For error messages. May be NULL */ + const char *profileName, /* Name of profile */ + int *profilePtr) /* Output */ +{ + size_t i; + size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + + for (i = 0; i < numProfiles; ++i) { + if (!strcmp(profileName, encodingProfiles[i].name)) { + *profilePtr = encodingProfiles[i].value; + return TCL_OK; + } + } + if (interp) { + Tcl_Obj *errorObj; + /* This code assumes at least two profiles :-) */ + errorObj = + Tcl_ObjPrintf("bad profile name \"%s\": must be", + profileName); + for (i = 0; i < (numProfiles - 1); ++i) { + Tcl_AppendStringsToObj( + errorObj, " ", encodingProfiles[i].name, ",", NULL); + } + Tcl_AppendStringsToObj( + errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL); + + Tcl_SetObjResult(interp, errorObj); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); + } + return TCL_ERROR; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingProfileValueToName -- + * + * Maps an encoding profile value to its name. + * + * Results: + * Pointer to the name or NULL on failure. Caller must not make + * not modify the string and must make a copy to hold on to it. + * + * Side effects: + * None. + *------------------------------------------------------------------------ + */ +const char * +TclEncodingProfileIdToName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int profileValue) /* Profile #define value */ +{ + size_t i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (profileValue == encodingProfiles[i].value) { + return encodingProfiles[i].name; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "Internal error. Bad profile id \"%d\".", + profileValue)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILEID", NULL); + } + return NULL; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingSetProfileFlags -- + * + * Maps the flags supported in the encoding C API's to internal flags. + * + * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is + * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile + * specified. + * + * If no profile or an invalid profile is specified, it is set to + * the default. + * + * Results: + * Internal encoding flag mask. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------ + */ +int TclEncodingSetProfileFlags(int flags) +{ + if (flags & TCL_ENCODING_STOPONERROR) { + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); + } else { + int profile = CHANNEL_PROFILE_GET(flags); + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: + break; + case 0: /* Unspecified by caller */ + default: + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); + break; + } + } + return flags; +} + +/* + *------------------------------------------------------------------------ + * + * TclGetEncodingProfiles -- + * + * Get the list of supported encoding profiles. + * + * Results: + * None. + * + * Side effects: + * The list of profile names is stored in the interpreter result. + * + *------------------------------------------------------------------------ + */ +void +TclGetEncodingProfiles(Tcl_Interp *interp) +{ + size_t i, n; + Tcl_Obj *objPtr; + n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + objPtr = Tcl_NewListObj(n, NULL); + for (i = 0; i < n; ++i) { + Tcl_ListObjAppendElement( + interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE)); + } + Tcl_SetObjResult(interp, objPtr); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 |
