diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 791 |
1 files changed, 631 insertions, 160 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 61e3236..2a96383 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -188,6 +188,36 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* + * Names of encoding profiles and corresponding integer values. + * Keep alphabetical order for error messages. + */ +static struct TclEncodingProfiles { + const char *name; + int value; +} encodingProfiles[] = { + {"replace", TCL_ENCODING_PROFILE_REPLACE}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, +}; +#define PROFILE_TCL8(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8)) +#define PROFILE_STRICT(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) +#define PROFILE_REPLACE(flags_) \ + ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \ + || (CHANNEL_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) + +#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) + +/* * The following variable is used in the sparse matrix code for a * TableEncoding to represent a page in the table that has no entries. */ @@ -230,6 +260,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field * of the internalrep. This should help the lifetime of encodings be more useful. @@ -522,11 +553,16 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and - * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ -#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ +/* + * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS + * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this + * when adding bits. TODO - should really be defined in a single file. + * + * To prevent conflicting bits, only define bits within 0xff00 mask here. + */ +#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */ #define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ -#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ +#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ void TclInitEncodingSubsystem(void) @@ -539,12 +575,16 @@ TclInitEncodingSubsystem(void) char c; short s; } isLe; + int leFlags; if (encodingsInitialized) { return; } - isLe.s = TCL_ENCODING_LE; + /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */ + isLe.s = 1; + leFlags = isLe.c ? TCL_ENCODING_LE : 0; + Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -585,7 +625,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -599,7 +639,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "utf-32"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -613,7 +653,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; - type.clientData = INT2PTR(isLe.c|ENCODING_UTF); + type.clientData = INT2PTR(leFlags|ENCODING_UTF); Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED @@ -897,7 +937,7 @@ Tcl_GetEncodingNames( Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); Tcl_CreateHashEntry(&table, - Tcl_NewStringObj(encodingPtr->name, -1), &dummy); + Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy); } Tcl_MutexUnlock(&encodingMutex); @@ -1114,7 +1154,8 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_ExternalToUtfDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1128,34 +1169,53 @@ Tcl_ExternalToUtfDString( * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * Any other flag bits will cause an error to be returned (for future + * compatibility) * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. - * + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner. + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -Tcl_Size +int Tcl_ExternalToUtfDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; @@ -1164,14 +1224,27 @@ Tcl_ExternalToUtfDStringEx( Tcl_Size dstLen, soFar; const char *srcStart = src; + /* DO FIRST - Must always be initialized before returning */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; if (encoding == NULL) { encoding = systemEncoding; } - encodingPtr = (Encoding *) encoding; + encodingPtr = (Encoding *)encoding; if (src == NULL) { srcLen = 0; @@ -1213,8 +1286,31 @@ Tcl_ExternalToUtfDStringEx( */ if ((result != TCL_CONVERT_NOSPACE) && !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) { + Tcl_Size nBytesProcessed = (src - srcStart); + Tcl_DStringSetLength(dstPtr, soFar); - return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + char buf[TCL_INTEGER_SPACE]; + snprintf(buf, sizeof(buf), "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("unexpected byte sequence starting at index %" + TCL_Z_MODIFIER "u: '\\x%02X'", + nBytesProcessed, + UCHAR(srcStart[nBytesProcessed]))); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; @@ -1319,6 +1415,9 @@ Tcl_ExternalToUtf( } if (!noTerminate) { + if ((int) dstLen < 1) { + return TCL_CONVERT_NOSPACE; + } /* * If there are any null characters in the middle of the buffer, * they will converted to the UTF-8 null character (\xC0\x80). To get @@ -1327,6 +1426,10 @@ Tcl_ExternalToUtf( */ dstLen--; + } else { + if (dstLen <= 0 && srcLen > 0) { + return TCL_CONVERT_NOSPACE; + } } if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= ENCODING_INPUT; @@ -1382,7 +1485,8 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_UtfToExternalDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1395,45 +1499,73 @@ Tcl_UtfToExternalDString( * Convert a source buffer from UTF-8 to the specified encoding. * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in the - * target encoding. - * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfToExternalDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; int result; - Tcl_Size dstLen, soFar; const char *srcStart = src; + Tcl_Size dstLen, soFar; + /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1447,6 +1579,7 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen == TCL_INDEX_NONE) { srcLen = strlen(src); } + flags |= TCL_ENCODING_START; while (1) { int srcChunkLen, srcChunkRead; @@ -1461,8 +1594,8 @@ Tcl_UtfToExternalDStringEx( dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen; result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcChunkLen, flags, &state, dst, dstChunkLen, - &srcChunkRead, &dstChunkWrote, &dstChunkChars); + srcChunkLen, flags, &state, dst, dstChunkLen, + &srcChunkRead, &dstChunkWrote, &dstChunkChars); soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr); /* Move past the part processed in this go around */ @@ -1478,12 +1611,38 @@ Tcl_UtfToExternalDStringEx( */ if ((result != TCL_CONVERT_NOSPACE) && !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) { + Tcl_Size nBytesProcessed = (src - srcStart); size_t i = soFar + encodingPtr->nullSize - 1; /* Loop as DStringSetLength only stores one nul byte at a time */ while (i+1 >= soFar+1) { Tcl_DStringSetLength(dstPtr, i--); } - return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4); + snprintf(buf, sizeof(buf), "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "unexpected character at index %" TCL_Z_MODIFIER + "u: 'U+%06X'", + pos, + ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; @@ -1581,10 +1740,17 @@ Tcl_UtfToExternal( dstCharsPtr = &dstChars; } + if (dstLen < encodingPtr->nullSize) { + return TCL_CONVERT_NOSPACE; + } dstLen -= encodingPtr->nullSize; result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); + /* + * Buffer is terminated irrespective of result. Not sure this is + * reasonable but keep for historical/compatibility reasons. + */ memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); return result; @@ -1644,7 +1810,7 @@ OpenEncodingFileChannel( const char *name) /* The name of the encoding file on disk and * also the name for new encoding. */ { - Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1); + Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE); Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj); Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath()); Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap); @@ -1654,7 +1820,7 @@ OpenEncodingFileChannel( TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir); Tcl_IncrRefCount(nameObj); - Tcl_AppendToObj(fileNameObj, ".enc", -1); + Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE); Tcl_IncrRefCount(fileNameObj); Tcl_DictObjGet(NULL, map, nameObj, &directory); @@ -2286,6 +2452,7 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { srcLen = *dstCharsPtr; } @@ -2319,14 +2486,12 @@ BinaryProc( *------------------------------------------------------------------------- */ -#define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN)) - static int UtfToUtfProc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ + void *clientData, /* additional flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ @@ -2348,12 +2513,14 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int profile; result = TCL_OK; srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= 6; } @@ -2365,7 +2532,9 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + profile = CHANNEL_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2382,29 +2551,35 @@ UtfToUtfProc( if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { /* * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to 0xC080. + * mode, so that they get converted to \xC0\x80. */ - *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) && (!(flags & ENCODING_INPUT) - || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - || (flags & ENCODING_FAILINDEX))) { - /* - * If in input mode, and -strict or -failindex is specified: This is an error. - */ - if ((STOPONERROR) && (flags & ENCODING_INPUT)) { - result = TCL_CONVERT_SYNTAX; - break; + } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && + (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || + PROFILE_REPLACE(profile))) { + /* Special sequence \xC0\x80 */ + if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } + } else { + /* + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. + */ + *dst++ = 0; + src += 2; } - /* - * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. - */ - *dst++ = 0; - src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* + * Incomplete byte sequence. * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves @@ -2412,32 +2587,43 @@ UtfToUtfProc( */ if (flags & ENCODING_INPUT) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { - result = TCL_CONVERT_MULTIBYTE; - break; - } - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) { - result = TCL_CONVERT_SYNTAX; + /* Incomplete bytes for modified UTF-8 target */ + if (PROFILE_STRICT(profile)) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; break; } } - char chbuf[2]; - chbuf[0] = UCHAR(*src++); chbuf[1] = 0; - TclUtfToUCS4(chbuf, &ch); + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + ++src; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + char chbuf[2]; + chbuf[0] = UCHAR(*src++); chbuf[1] = 0; + TclUtfToUCS4(chbuf, &ch); + } dst += Tcl_UniCharToUtf(ch, dst); } else { + int isInvalid = 0; size_t len = TclUtfToUCS4(src, &ch); if (flags & ENCODING_INPUT) { - if ((len < 2) && (ch != 0) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) { - goto utf8Syntax; - } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) { - utf8Syntax: - result = TCL_CONVERT_SYNTAX; - break; + if ((len < 2) && (ch != 0)) { + isInvalid = 1; + } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) { + isInvalid = 1; + } + if (isInvalid) { + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } } } + const char *saveSrc = src; src += len; if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { @@ -2457,38 +2643,45 @@ UtfToUtfProc( *dst++ = (char) ((ch | 0x80) & 0xBF); continue; #if TCL_UTF_MAX < 4 - } else if ((ch | 0x7FF) == 0xDFFF) { + } else if (SURROGATE(ch)) { /* * A surrogate character is detected, handle especially. */ - - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) { + if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } - int low = ch; - len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - - if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; + if (PROFILE_REPLACE(profile)) { + /* TODO - is this right for cesu8 or should we fall through below? */ + ch = UNICODE_REPLACE_CHAR; + } else { + int low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + goto cesu8; } - goto cesu8; + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; #endif - } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch & ~0x7FF) == 0xD800))) { + } else if (PROFILE_STRICT(profile) && + (!(flags & ENCODING_INPUT)) && + SURROGATE(ch)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && (flags & ENCODING_INPUT) && ((ch & ~0x7FF) == 0xD800)) { + } else if (PROFILE_STRICT(profile) && + (flags & ENCODING_INPUT) && + SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2545,8 +2738,9 @@ Utf32ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - int ch, bytesLeft = srcLen % 4; + int ch = 0, bytesLeft = srcLen % 4; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2556,12 +2750,27 @@ Utf32ToUtfProc( /* * Check alignment with utf-32 (4 == sizeof(UTF-32)) */ - if (bytesLeft != 0) { + /* We have a truncated code unit */ result = TCL_CONVERT_MULTIBYTE; srcLen -= bytesLeft; } +#if TCL_UTF_MAX < 4 + /* + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. + */ + + if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) { + result = TCL_CONVERT_MULTIBYTE; + srcLen-= 4; + } +#endif + srcStart = src; srcEnd = src + srcLen; @@ -2574,23 +2783,35 @@ Utf32ToUtfProc( break; } +#if TCL_UTF_MAX < 4 + int prev = ch; +#endif if (flags & TCL_ENCODING_LE) { ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF) { - if (STOPONERROR) { - result = TCL_CONVERT_SYNTAX; - break; - } - ch = 0xFFFD; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && ((ch & ~0x7FF) == 0xD800)) { - if (STOPONERROR) { +#if TCL_UTF_MAX < 4 + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } +#endif + + if ((unsigned)ch > 0x10FFFF) { + ch = UNICODE_REPLACE_CHAR; + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } + } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; +#if TCL_UTF_MAX < 4 + ch = 0; +#endif + break; + } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) { + ch = UNICODE_REPLACE_CHAR; } /* @@ -2598,30 +2819,41 @@ Utf32ToUtfProc( * unsigned short-size data. */ - if ((ch > 0) && (ch < 0x80)) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned int); + src += 4; + } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ +#if TCL_UTF_MAX < 4 + if (HIGH_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); } +#endif if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a single byte left-over at the end */ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - /* destination is not full, so we really are at the end now */ - if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ result = TCL_OK; - dst += Tcl_UniCharToUtf(0xFFFD, dst); + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); numChars++; - src += bytesLeft; + src += bytesLeft; /* Go past truncated code unit */ } } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2674,6 +2906,7 @@ UtfToUtf32Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2698,11 +2931,14 @@ UtfToUtf32Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2768,6 +3004,7 @@ Utf16ToUtfProc( int result, numChars, charLimit = INT_MAX; unsigned short ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2800,7 +3037,7 @@ Utf16ToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; @@ -2812,9 +3049,30 @@ Utf16ToUtfProc( } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } - if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) { + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; /* Go back to beginning of high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } else if (PROFILE_REPLACE(flags)) { + /* + * Previous loop wrote a single byte to mark the high surrogate. + * Replace it with the replacement character. Further, restart + * current loop iteration since need to recheck destination space + * and reset processing of current character. + */ + ch = UNICODE_REPLACE_CHAR; + dst--; + dst += Tcl_UniCharToUtf(ch, dst); + src -= 2; + numChars--; + continue; + } else { /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + dst += Tcl_UniCharToUtf(-1, dst); + } } /* @@ -2822,34 +3080,59 @@ Utf16ToUtfProc( * unsigned short-size data. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); - } else { + } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); + } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { + /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } else { + /* PROFILE_REPLACE */ + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } + } else { + dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); } - if ((ch & ~0x3FF) == 0xD800) { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); + if (HIGH_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; + dst--; + numChars--; + } else if (PROFILE_REPLACE(flags)) { + dst--; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a single byte left-over at the end */ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - /* destination is not full, so we really are at the end now */ - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ result = TCL_OK; - dst += Tcl_UniCharToUtf(0xFFFD, dst); + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); numChars++; - src++; + src++; /* Go past truncated code unit */ } } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2902,6 +3185,7 @@ UtfToUtf16Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2926,11 +3210,14 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -3004,6 +3291,7 @@ UtfToUcs2Proc( int result, numChars, len; Tcl_UniChar ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); srcStart = src; srcEnd = src + srcLen; @@ -3033,25 +3321,25 @@ UtfToUcs2Proc( #if TCL_UTF_MAX < 4 len = TclUtfToUniChar(src, &ch); if ((ch >= 0xD800) && (len < 3)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } src += len; src += TclUtfToUniChar(src, &ch); - ch = 0xFFFD; + ch = UNICODE_REPLACE_CHAR; } #else len = TclUtfToUniChar(src, &ch); if (ch > 0xFFFF) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - ch = 0xFFFD; + ch = UNICODE_REPLACE_CHAR; } #endif - if (STOPONERROR && ((ch & ~0x7FF) == 0xD800)) { + if (PROFILE_STRICT(flags) && SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; break; } @@ -3126,6 +3414,7 @@ TableToUtfProc( const unsigned short *pageZero; TableEncodingData *dataPtr = (TableEncodingData *)clientData; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3149,31 +3438,47 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { - src--; - result = TCL_CONVERT_MULTIBYTE; - break; + if (!(flags & TCL_ENCODING_END)) { + src--; + result = TCL_CONVERT_MULTIBYTE; + break; + } else if (PROFILE_STRICT(flags)) { + src--; + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + src--; /* See bug [bdcb5126c0] */ + result = TCL_CONVERT_MULTIBYTE; + break; + } + } else { + ch = toUnicode[byte][*((unsigned char *)src)]; } - ch = toUnicode[byte][*((unsigned char *) src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if ((flags & ENCODING_FAILINDEX) - || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar)byte; + } } /* * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3243,6 +3548,7 @@ TableFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3275,11 +3581,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3349,6 +3655,7 @@ Iso88591ToUtfProc( const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3372,7 +3679,7 @@ Iso88591ToUtfProc( * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3433,6 +3740,7 @@ Iso88591FromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3463,7 +3771,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3476,7 +3784,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3580,6 +3888,7 @@ EscapeToUtfProc( int state, result, numChars, charLimit = INT_MAX; const char *dstStart, *dstEnd; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3690,9 +3999,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!STOPONERROR) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3802,6 +4112,7 @@ EscapeFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3865,7 +4176,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -4159,6 +4470,166 @@ InitializeEncodingSearchPath( } /* + *------------------------------------------------------------------------ + * + * TclEncodingProfileParseName -- + * + * Maps an encoding profile name to its integer equivalent. + * + * Results: + * TCL_OK on success or TCL_ERROR on failure. + * + * Side effects: + * Returns the profile enum value in *profilePtr + * + *------------------------------------------------------------------------ + */ +int +TclEncodingProfileNameToId( + Tcl_Interp *interp, /* For error messages. May be NULL */ + const char *profileName, /* Name of profile */ + int *profilePtr) /* Output */ +{ + size_t i; + size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + + for (i = 0; i < numProfiles; ++i) { + if (!strcmp(profileName, encodingProfiles[i].name)) { + *profilePtr = encodingProfiles[i].value; + return TCL_OK; + } + } + if (interp) { + Tcl_Obj *errorObj; + /* This code assumes at least two profiles :-) */ + errorObj = + Tcl_ObjPrintf("bad profile name \"%s\": must be", + profileName); + for (i = 0; i < (numProfiles - 1); ++i) { + Tcl_AppendStringsToObj( + errorObj, " ", encodingProfiles[i].name, ",", NULL); + } + Tcl_AppendStringsToObj( + errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL); + + Tcl_SetObjResult(interp, errorObj); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); + } + return TCL_ERROR; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingProfileValueToName -- + * + * Maps an encoding profile value to its name. + * + * Results: + * Pointer to the name or NULL on failure. Caller must not make + * not modify the string and must make a copy to hold on to it. + * + * Side effects: + * None. + *------------------------------------------------------------------------ + */ +const char * +TclEncodingProfileIdToName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int profileValue) /* Profile #define value */ +{ + size_t i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (profileValue == encodingProfiles[i].value) { + return encodingProfiles[i].name; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "Internal error. Bad profile id \"%d\".", + profileValue)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILEID", NULL); + } + return NULL; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingSetProfileFlags -- + * + * Maps the flags supported in the encoding C API's to internal flags. + * + * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is + * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile + * specified. + * + * If no profile or an invalid profile is specified, it is set to + * the default. + * + * Results: + * Internal encoding flag mask. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------ + */ +int TclEncodingSetProfileFlags(int flags) +{ + if (flags & TCL_ENCODING_STOPONERROR) { + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); + } else { + int profile = CHANNEL_PROFILE_GET(flags); + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: + break; + case 0: /* Unspecified by caller */ + default: + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); + break; + } + } + return flags; +} + +/* + *------------------------------------------------------------------------ + * + * TclGetEncodingProfiles -- + * + * Get the list of supported encoding profiles. + * + * Results: + * None. + * + * Side effects: + * The list of profile names is stored in the interpreter result. + * + *------------------------------------------------------------------------ + */ +void +TclGetEncodingProfiles(Tcl_Interp *interp) +{ + size_t i, n; + Tcl_Obj *objPtr; + n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + objPtr = Tcl_NewListObj(n, NULL); + for (i = 0; i < n; ++i) { + Tcl_ListObjAppendElement( + interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE)); + } + Tcl_SetObjResult(interp, objPtr); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 |