diff options
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 531 |
1 files changed, 444 insertions, 87 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 72b6ee3..0ce75b4 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -37,7 +37,7 @@ typedef struct { * end-of-string in this encoding. This number * is used to determine the source string * length when the srcLen argument is - * negative. This number can be 1 or 2. */ + * negative. This number can be 1, 2, or 4. */ ClientData clientData; /* Arbitrary value associated with encoding * type. Passed to conversion functions. */ LengthProc *lengthProc; /* Function to compute length of @@ -45,7 +45,9 @@ typedef struct { * If nullSize is 1, this is strlen; if * nullSize is 2, this is a function that * returns the number of bytes in a 0x0000 - * terminated string. */ + * terminated string; if nullSize is 4, this + * is a function that returns the number of + * bytes in a 0x00000000 terminated string. */ size_t refCount; /* Number of uses of this structure. */ Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ } Encoding; @@ -196,13 +198,13 @@ static unsigned short emptyPage[256]; */ static Tcl_EncodingConvertProc BinaryProc; -static Tcl_DupInternalRepProc DupEncodingIntRep; +static Tcl_DupInternalRepProc DupEncodingInternalRep; static Tcl_EncodingFreeProc EscapeFreeProc; static Tcl_EncodingConvertProc EscapeFromUtfProc; static Tcl_EncodingConvertProc EscapeToUtfProc; static void FillEncodingFileMap(void); static void FreeEncoding(Tcl_Encoding encoding); -static Tcl_FreeInternalRepProc FreeEncodingIntRep; +static Tcl_FreeInternalRepProc FreeEncodingInternalRep; static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, int state); static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, @@ -216,7 +218,10 @@ static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, static Tcl_EncodingFreeProc TableFreeProc; static Tcl_EncodingConvertProc TableFromUtfProc; static Tcl_EncodingConvertProc TableToUtfProc; -static size_t unilen(const char *src); +static size_t unilen(const char *src); +static size_t unilen4(const char *src); +static Tcl_EncodingConvertProc Utf32ToUtfProc; +static Tcl_EncodingConvertProc UtfToUtf32Proc; static Tcl_EncodingConvertProc Utf16ToUtfProc; static Tcl_EncodingConvertProc UtfToUtf16Proc; static Tcl_EncodingConvertProc UtfToUcs2Proc; @@ -226,25 +231,25 @@ static Tcl_EncodingConvertProc Iso88591ToUtfProc; /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field - * of the intrep. This should help the lifetime of encodings be more useful. + * of the internalrep. This should help the lifetime of encodings be more useful. * See concerns raised in [Bug 1077262]. */ static const Tcl_ObjType encodingType = { - "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL + "encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL }; -#define EncodingSetIntRep(objPtr, encoding) \ +#define EncodingSetInternalRep(objPtr, encoding) \ do { \ - Tcl_ObjIntRep ir; \ + Tcl_ObjInternalRep ir; \ ir.twoPtrValue.ptr1 = (encoding); \ ir.twoPtrValue.ptr2 = NULL; \ - Tcl_StoreIntRep((objPtr), &encodingType, &ir); \ + Tcl_StoreInternalRep((objPtr), &encodingType, &ir); \ } while (0) -#define EncodingGetIntRep(objPtr, encoding) \ +#define EncodingGetInternalRep(objPtr, encoding) \ do { \ - const Tcl_ObjIntRep *irPtr; \ - irPtr = TclFetchIntRep ((objPtr), &encodingType); \ + const Tcl_ObjInternalRep *irPtr; \ + irPtr = TclFetchInternalRep ((objPtr), &encodingType); \ (encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL; \ } while (0) @@ -277,13 +282,13 @@ Tcl_GetEncodingFromObj( Tcl_Encoding encoding; const char *name = TclGetString(objPtr); - EncodingGetIntRep(objPtr, encoding); + EncodingGetInternalRep(objPtr, encoding); if (encoding == NULL) { encoding = Tcl_GetEncoding(interp, name); if (encoding == NULL) { return TCL_ERROR; } - EncodingSetIntRep(objPtr, encoding); + EncodingSetInternalRep(objPtr, encoding); } *encodingPtr = Tcl_GetEncoding(NULL, name); return TCL_OK; @@ -292,7 +297,7 @@ Tcl_GetEncodingFromObj( /* *---------------------------------------------------------------------- * - * FreeEncodingIntRep -- + * FreeEncodingInternalRep -- * * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType. * @@ -300,19 +305,19 @@ Tcl_GetEncodingFromObj( */ static void -FreeEncodingIntRep( +FreeEncodingInternalRep( Tcl_Obj *objPtr) { Tcl_Encoding encoding; - EncodingGetIntRep(objPtr, encoding); + EncodingGetInternalRep(objPtr, encoding); Tcl_FreeEncoding(encoding); } /* *---------------------------------------------------------------------- * - * DupEncodingIntRep -- + * DupEncodingInternalRep -- * * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType. * @@ -320,12 +325,12 @@ FreeEncodingIntRep( */ static void -DupEncodingIntRep( +DupEncodingInternalRep( Tcl_Obj *srcPtr, Tcl_Obj *dupPtr) { Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr)); - EncodingSetIntRep(dupPtr, encoding); + EncodingSetInternalRep(dupPtr, encoding); } /* @@ -365,7 +370,7 @@ Tcl_SetEncodingSearchPath( { int dummy; - if (TCL_ERROR == Tcl_ListObjLength(NULL, searchPath, &dummy)) { + if (TCL_ERROR == TclListObjLengthM(NULL, searchPath, &dummy)) { return TCL_ERROR; } TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL); @@ -412,7 +417,7 @@ TclSetLibraryPath( { int dummy; - if (TCL_ERROR == Tcl_ListObjLength(NULL, path, &dummy)) { + if (TCL_ERROR == TclListObjLengthM(NULL, path, &dummy)) { return; } TclSetProcessGlobalValue(&libraryPath, path, NULL); @@ -451,7 +456,7 @@ FillEncodingFileMap(void) searchPath = Tcl_GetEncodingSearchPath(); Tcl_IncrRefCount(searchPath); - Tcl_ListObjLength(NULL, searchPath, &numDirs); + TclListObjLengthM(NULL, searchPath, &numDirs); map = Tcl_NewDictObj(); Tcl_IncrRefCount(map); @@ -475,7 +480,7 @@ FillEncodingFileMap(void) Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc", &readableFiles); - Tcl_ListObjGetElements(NULL, matchFileList, &numFiles, &filev); + TclListObjGetElementsM(NULL, matchFileList, &numFiles, &filev); for (j=0; j<numFiles; j++) { Tcl_Obj *encodingName, *fileObj; @@ -510,11 +515,10 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* This flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ -#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8 and - * TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -556,7 +560,10 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -573,6 +580,20 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.toUtfProc = Utf32ToUtfProc; + type.fromUtfProc = UtfToUtf32Proc; + type.freeProc = NULL; + type.nullSize = 4; + type.encodingName = "utf-32le"; + type.clientData = INT2PTR(TCL_ENCODING_LE); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32"; + type.clientData = INT2PTR(isLe.c); + Tcl_CreateEncoding(&type); + type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUtf16Proc; type.freeProc = NULL; @@ -705,7 +726,7 @@ Tcl_GetDefaultEncodingDir(void) int numDirs; Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath(); - Tcl_ListObjLength(NULL, searchPath, &numDirs); + TclListObjLengthM(NULL, searchPath, &numDirs); if (numDirs == 0) { return NULL; } @@ -1053,10 +1074,12 @@ Tcl_CreateEncoding( encodingPtr->freeProc = typePtr->freeProc; encodingPtr->nullSize = typePtr->nullSize; encodingPtr->clientData = typePtr->clientData; - if (typePtr->nullSize == 1) { - encodingPtr->lengthProc = (LengthProc *) strlen; - } else { + if (typePtr->nullSize == 2) { encodingPtr->lengthProc = (LengthProc *) unilen; + } else if (typePtr->nullSize == 4) { + encodingPtr->lengthProc = (LengthProc *) unilen4; + } else { + encodingPtr->lengthProc = (LengthProc *) strlen; } encodingPtr->refCount = 1; encodingPtr->hPtr = NULL; @@ -1119,10 +1142,56 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_ExternalToUtfDStringEx -- + * + * Convert a source buffer from the specified encoding into UTF-8. +* The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in utf-8. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +int +Tcl_ExternalToUtfDStringEx( + Tcl_Encoding encoding, /* The encoding for the source string, or NULL + * for the default system encoding. */ + const char *src, /* Source string in specified encoding. */ + int srcLen, /* Source string length in bytes, or < 0 for + * encoding-specific string length. */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int dstLen, result, soFar, srcRead, dstWrote, dstChars; + const char *srcStart = src; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -1139,9 +1208,9 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } while (1) { @@ -1152,7 +1221,7 @@ Tcl_ExternalToUtfDString( src += srcRead; if (result != TCL_CONVERT_NOSPACE) { Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? TCL_INDEX_NONE : (int)(src - srcStart); } flags &= ~TCL_ENCODING_START; srcLen -= srcRead; @@ -1258,7 +1327,7 @@ Tcl_ExternalToUtf( dstLen--; } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } do { Tcl_EncodingState savedState = *statePtr; @@ -1269,7 +1338,7 @@ Tcl_ExternalToUtf( if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); + dstLen = TclUtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); *statePtr = savedState; } while (1); if (!noTerminate) { @@ -1311,10 +1380,57 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_UtfToExternalDStringEx -- + * + * Convert a source buffer from UTF-8 to the specified encoding. + * The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in the + * target encoding. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +int +Tcl_UtfToExternalDStringEx( + Tcl_Encoding encoding, /* The encoding for the converted string, or + * NULL for the default system encoding. */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes, or < 0 for + * strlen(). */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int dstLen, result, soFar, srcRead, dstWrote, dstChars; + const char *srcStart = src; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -1330,23 +1446,23 @@ Tcl_UtfToExternalDString( } else if (srcLen < 0) { srcLen = strlen(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - if (encodingPtr->nullSize == 2) { - Tcl_DStringSetLength(dstPtr, soFar + 1); + int i = soFar + encodingPtr->nullSize - 1; + while (i >= soFar) { + Tcl_DStringSetLength(dstPtr, i--); } - Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? TCL_INDEX_NONE : (int)(src - srcStart); } flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1437,10 +1553,7 @@ Tcl_UtfToExternal( result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); - if (encodingPtr->nullSize == 2) { - dst[*dstWrotePtr + 1] = '\0'; - } - dst[*dstWrotePtr] = '\0'; + memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); return result; } @@ -1463,14 +1576,15 @@ Tcl_UtfToExternal( *--------------------------------------------------------------------------- */ #undef Tcl_FindExecutable -void +const char * Tcl_FindExecutable( const char *argv0) /* The value of the application's argv[0] * (native). */ { - Tcl_InitSubsystems(); + const char *version = Tcl_InitSubsystems(); TclpSetInitialEncodings(); TclpFindExecutable(argv0); + return version; } /* @@ -1506,7 +1620,7 @@ OpenEncodingFileChannel( Tcl_Channel chan = NULL; int i, numDirs; - Tcl_ListObjGetElements(NULL, searchPath, &numDirs, &dir); + TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir); Tcl_IncrRefCount(nameObj); Tcl_AppendToObj(fileNameObj, ".enc", -1); Tcl_IncrRefCount(fileNameObj); @@ -2173,6 +2287,12 @@ BinaryProc( *------------------------------------------------------------------------- */ +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) +# define STOPONERROR !(flags & TCL_ENCODING_NOCOMPLAIN) +#else +# define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) +#endif + static int UtfToUtfProc( ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ @@ -2215,7 +2335,7 @@ UtfToUtfProc( dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - TCL_UTF_MAX; + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2231,15 +2351,15 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) { + if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { /* * Copy 7bit characters, but skip null-bytes when we are in input * mode, so that they get converted to 0xC080. */ *dst++ = *src++; - } else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) - && UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) { + } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) + && (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED)) { /* * Convert 0xC080 to real nulls when we are in output mode. */ @@ -2255,7 +2375,7 @@ UtfToUtfProc( */ if (flags & TCL_ENCODING_MODIFIED) { - if (flags & TCL_ENCODING_STOPONERROR) { + if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { result = TCL_CONVERT_MULTIBYTE; break; } @@ -2268,14 +2388,25 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); } else { int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) + if ((len < 2) && (ch != 0) && STOPONERROR && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ @@ -2284,6 +2415,13 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); @@ -2292,6 +2430,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; + } else if (!Tcl_UniCharIsUnicode(ch)) { + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } } dst += Tcl_UniCharToUtf(ch, dst); } @@ -2306,6 +2453,199 @@ UtfToUtfProc( /* *------------------------------------------------------------------------- * + * Utf32ToUtfProc -- + * + * Convert from UTF-32 to UTF-8. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +Utf32ToUtfProc( + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in Unicode. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd; + const char *dstEnd, *dstStart; + int result, numChars, charLimit = INT_MAX; + int ch; + + flags |= PTR2INT(clientData); + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } + result = TCL_OK; + + /* + * Check alignment with utf-32 (4 == sizeof(UTF-32)) + */ + + if ((srcLen % 4) != 0) { + result = TCL_CONVERT_MULTIBYTE; + srcLen &= -4; + } + + srcStart = src; + srcEnd = src + srcLen; + + dstStart = dst; + dstEnd = dst + dstLen - TCL_UTF_MAX; + + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + + if (flags & TCL_ENCODING_LE) { + ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); + } + + /* + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. + */ + + if ((ch > 0) && (ch < 0x80)) { + *dst++ = (ch & 0xFF); + } else { + dst += Tcl_UniCharToUtf(ch, dst); + } + src += sizeof(unsigned int); + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUtf32Proc -- + * + * Convert from UTF-8 to UTF-32. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUtf32Proc( + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; + int ch, len; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + len = TclUtfToUCS4(src, &ch); + if (!Tcl_UniCharIsUnicode(ch)) { + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; + if (flags & TCL_ENCODING_LE) { + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 24) & 0xFF); + } else { + *dst++ = ((ch >> 24) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); + } + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * * Utf16ToUtfProc -- * * Convert from UTF-16 to UTF-8. @@ -2450,7 +2790,7 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; @@ -2478,7 +2818,15 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); + len = TclUtfToUCS4(src, &ch); + if (!Tcl_UniCharIsUnicode(ch)) { + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); @@ -2548,7 +2896,7 @@ UtfToUcs2Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; -#if TCL_UTF_MAX <= 3 +#if TCL_UTF_MAX < 4 int len; #endif Tcl_UniChar ch = 0; @@ -2579,7 +2927,7 @@ UtfToUcs2Proc( result = TCL_CONVERT_NOSPACE; break; } -#if TCL_UTF_MAX <= 3 +#if TCL_UTF_MAX < 4 src += (len = TclUtfToUniChar(src, &ch)); if ((ch >= 0xD800) && (len < 3)) { src += TclUtfToUniChar(src, &ch); @@ -2610,7 +2958,7 @@ UtfToUcs2Proc( *dstCharsPtr = numChars; return result; } - + /* *------------------------------------------------------------------------- * @@ -2692,7 +3040,7 @@ TableToUtfProc( ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_SYNTAX; break; } @@ -2796,11 +3144,7 @@ TableFromUtfProc( len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX > 3 - /* - * This prevents a crash condition. More evaluation is required for - * full support of int Tcl_UniChar. [Bug 1004065] - */ - + /* Unicode chars > +U0FFFF cannot be represented in any table encoding */ if (ch & 0xFFFF0000) { word = 0; } else @@ -2812,7 +3156,7 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -2996,15 +3340,15 @@ Iso88591FromUtfProc( */ if (ch > 0xFF -#if TCL_UTF_MAX <= 3 +#if TCL_UTF_MAX < 4 || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } -#if TCL_UTF_MAX <= 3 +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (len < 3)) { len = 4; } @@ -3052,7 +3396,7 @@ TableFreeProc( ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *) clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3110,7 +3454,7 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; @@ -3227,7 +3571,7 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if ((flags & TCL_ENCODING_STOPONERROR) == 0) { + if (!STOPONERROR) { /* * Skip the unknown escape sequence. */ @@ -3402,7 +3746,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3587,7 +3931,7 @@ GetTableEncoding( /* *--------------------------------------------------------------------------- * - * unilen -- + * unilen, unilen4 -- * * A helper function for the Tcl_ExternalToUtf functions. This function * is similar to strlen for double-byte characters: it returns the number @@ -3614,6 +3958,19 @@ unilen( } return (char *) p - src; } + +static size_t +unilen4( + const char *src) +{ + unsigned int *p; + + p = (unsigned int *) src; + while (*p != 0x00000000) { + p++; + } + return (char *) p - src; +} /* *------------------------------------------------------------------------- @@ -3645,7 +4002,7 @@ InitializeEncodingSearchPath( Tcl_Encoding *encodingPtr) { const char *bytes; - int i, numDirs; + int i, numDirs, numBytes; Tcl_Obj *libPathObj, *encodingObj, *searchPathObj; TclNewLiteralStringObj(encodingObj, "encoding"); @@ -3654,7 +4011,7 @@ InitializeEncodingSearchPath( Tcl_IncrRefCount(searchPathObj); libPathObj = TclGetLibraryPath(); Tcl_IncrRefCount(libPathObj); - Tcl_ListObjLength(NULL, libPathObj, &numDirs); + TclListObjLengthM(NULL, libPathObj, &numDirs); for (i = 0; i < numDirs; i++) { Tcl_Obj *directoryObj, *pathObj; @@ -3675,11 +4032,11 @@ InitializeEncodingSearchPath( if (*encodingPtr) { ((Encoding *)(*encodingPtr))->refCount++; } - bytes = TclGetString(searchPathObj); + bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes); - *lengthPtr = searchPathObj->length; - *valuePtr = (char *)ckalloc(*lengthPtr + 1); - memcpy(*valuePtr, bytes, *lengthPtr + 1); + *lengthPtr = numBytes; + *valuePtr = (char *)ckalloc(numBytes + 1); + memcpy(*valuePtr, bytes, numBytes + 1); Tcl_DecrRefCount(searchPathObj); } |
