diff options
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 1654 |
1 files changed, 1252 insertions, 402 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 92217f3..fc2835d 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -3,13 +3,14 @@ * * Contains the implementation of the encoding conversion package. * - * Copyright (c) 1996-1998 Sun Microsystems, Inc. + * Copyright © 1996-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ #include "tclInt.h" +#include "tclIO.h" typedef size_t (LengthProc)(const char *src); @@ -33,20 +34,22 @@ typedef struct { Tcl_EncodingFreeProc *freeProc; /* If non-NULL, function to call when this * encoding is deleted. */ - int nullSize; /* Number of 0x00 bytes that signify + void *clientData; /* Arbitrary value associated with encoding + * type. Passed to conversion functions. */ + Tcl_Size nullSize; /* Number of 0x00 bytes that signify * end-of-string in this encoding. This number * is used to determine the source string * length when the srcLen argument is - * negative. This number can be 1 or 2. */ - ClientData clientData; /* Arbitrary value associated with encoding - * type. Passed to conversion functions. */ + * negative. This number can be 1, 2, or 4. */ LengthProc *lengthProc; /* Function to compute length of * null-terminated strings in this encoding. * If nullSize is 1, this is strlen; if * nullSize is 2, this is a function that * returns the number of bytes in a 0x0000 - * terminated string. */ - int refCount; /* Number of uses of this structure. */ + * terminated string; if nullSize is 4, this + * is a function that returns the number of + * bytes in a 0x00000000 terminated string. */ + size_t refCount; /* Number of uses of this structure. */ Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ } Encoding; @@ -185,6 +188,29 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* + * Names of encoding profiles and corresponding integer values. + * Keep alphabetical order for error messages. + */ +static struct TclEncodingProfiles { + const char *name; + int value; +} encodingProfiles[] = { + {"replace", TCL_ENCODING_PROFILE_REPLACE}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, +}; +#define PROFILE_STRICT(flags_) \ + (CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) + +#define PROFILE_REPLACE(flags_) \ + (CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) + +#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) + +/* * The following variable is used in the sparse matrix code for a * TableEncoding to represent a page in the table that has no entries. */ @@ -214,51 +240,19 @@ static Tcl_Encoding LoadEscapeEncoding(const char *name, static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, const char *name); static Tcl_EncodingFreeProc TableFreeProc; -static int TableFromUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int TableToUtfProc(ClientData clientData, const char *src, - int srcLen, int flags, Tcl_EncodingState *statePtr, - char *dst, int dstLen, int *srcReadPtr, - int *dstWrotePtr, int *dstCharsPtr); +static Tcl_EncodingConvertProc TableFromUtfProc; +static Tcl_EncodingConvertProc TableToUtfProc; static size_t unilen(const char *src); -static int UnicodeToUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int UtfToUnicodeProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int UtfToUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr, int pureNullMode); -static int UtfIntToUtfExtProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int UtfExtToUtfIntProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int Iso88591FromUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int Iso88591ToUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, - int dstLen, int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); +static size_t unilen4(const char *src); +static Tcl_EncodingConvertProc Utf32ToUtfProc; +static Tcl_EncodingConvertProc UtfToUtf32Proc; +static Tcl_EncodingConvertProc Utf16ToUtfProc; +static Tcl_EncodingConvertProc UtfToUtf16Proc; +static Tcl_EncodingConvertProc UtfToUcs2Proc; +static Tcl_EncodingConvertProc UtfToUtfProc; +static Tcl_EncodingConvertProc Iso88591FromUtfProc; +static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field @@ -267,8 +261,28 @@ static int Iso88591ToUtfProc(ClientData clientData, */ static const Tcl_ObjType encodingType = { - "encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL + "encoding", + FreeEncodingInternalRep, + DupEncodingInternalRep, + NULL, + NULL }; + +#define EncodingSetInternalRep(objPtr, encoding) \ + do { \ + Tcl_ObjInternalRep ir; \ + ir.twoPtrValue.ptr1 = (encoding); \ + ir.twoPtrValue.ptr2 = NULL; \ + Tcl_StoreInternalRep((objPtr), &encodingType, &ir); \ + } while (0) + +#define EncodingGetInternalRep(objPtr, encoding) \ + do { \ + const Tcl_ObjInternalRep *irPtr; \ + irPtr = TclFetchInternalRep ((objPtr), &encodingType); \ + (encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL; \ + } while (0) + /* *---------------------------------------------------------------------- @@ -295,17 +309,16 @@ Tcl_GetEncodingFromObj( Tcl_Obj *objPtr, Tcl_Encoding *encodingPtr) { + Tcl_Encoding encoding; const char *name = TclGetString(objPtr); - if (objPtr->typePtr != &encodingType) { - Tcl_Encoding encoding = Tcl_GetEncoding(interp, name); - + EncodingGetInternalRep(objPtr, encoding); + if (encoding == NULL) { + encoding = Tcl_GetEncoding(interp, name); if (encoding == NULL) { return TCL_ERROR; } - TclFreeIntRep(objPtr); - objPtr->internalRep.twoPtrValue.ptr1 = encoding; - objPtr->typePtr = &encodingType; + EncodingSetInternalRep(objPtr, encoding); } *encodingPtr = Tcl_GetEncoding(NULL, name); return TCL_OK; @@ -325,8 +338,10 @@ static void FreeEncodingInternalRep( Tcl_Obj *objPtr) { - Tcl_FreeEncoding((Tcl_Encoding)objPtr->internalRep.twoPtrValue.ptr1); - objPtr->typePtr = NULL; + Tcl_Encoding encoding; + + EncodingGetInternalRep(objPtr, encoding); + Tcl_FreeEncoding(encoding); } /* @@ -344,7 +359,8 @@ DupEncodingInternalRep( Tcl_Obj *srcPtr, Tcl_Obj *dupPtr) { - dupPtr->internalRep.twoPtrValue.ptr1 = Tcl_GetEncoding(NULL, srcPtr->bytes); + Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr)); + EncodingSetInternalRep(dupPtr, encoding); } /* @@ -382,9 +398,9 @@ int Tcl_SetEncodingSearchPath( Tcl_Obj *searchPath) { - int dummy; + Tcl_Size dummy; - if (TCL_ERROR == TclListObjLength(NULL, searchPath, &dummy)) { + if (TCL_ERROR == TclListObjLengthM(NULL, searchPath, &dummy)) { return TCL_ERROR; } TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL); @@ -429,9 +445,9 @@ void TclSetLibraryPath( Tcl_Obj *path) { - int dummy; + Tcl_Size dummy; - if (TCL_ERROR == TclListObjLength(NULL, path, &dummy)) { + if (TCL_ERROR == TclListObjLengthM(NULL, path, &dummy)) { return; } TclSetProcessGlobalValue(&libraryPath, path, NULL); @@ -465,22 +481,22 @@ TclSetLibraryPath( static void FillEncodingFileMap(void) { - int i, numDirs = 0; + Tcl_Size i, numDirs = 0; Tcl_Obj *map, *searchPath; searchPath = Tcl_GetEncodingSearchPath(); Tcl_IncrRefCount(searchPath); - TclListObjLength(NULL, searchPath, &numDirs); + TclListObjLengthM(NULL, searchPath, &numDirs); map = Tcl_NewDictObj(); Tcl_IncrRefCount(map); - for (i = numDirs-1; i >= 0; i--) { + for (i = numDirs-1; i != TCL_INDEX_NONE; i--) { /* * Iterate backwards through the search path so as we overwrite * entries found, we favor files earlier on the search path. */ - int j, numFiles; + Tcl_Size j, numFiles; Tcl_Obj *directory, *matchFileList; Tcl_Obj **filev; Tcl_GlobTypeData readableFiles = { @@ -494,7 +510,7 @@ FillEncodingFileMap(void) Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc", &readableFiles); - TclListObjGetElements(NULL, matchFileList, &numFiles, &filev); + TclListObjGetElementsM(NULL, matchFileList, &numFiles, &filev); for (j=0; j<numFiles; j++) { Tcl_Obj *encodingName, *fileObj; @@ -529,6 +545,17 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ +/* + * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS + * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this + * when adding bits. TODO - should really be defined in a single file. + * + * To prevent conflicting bits, only define bits within 0xff00 mask here. + */ +#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */ +#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ + void TclInitEncodingSubsystem(void) { @@ -540,12 +567,16 @@ TclInitEncodingSubsystem(void) char c; short s; } isLe; + int leFlags; if (encodingsInitialized) { return; } + /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */ isLe.s = 1; + leFlags = isLe.c ? TCL_ENCODING_LE : 0; + Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -556,7 +587,7 @@ TclInitEncodingSubsystem(void) * properly formed stream. */ - type.encodingName = "identity"; + type.encodingName = NULL; type.toUtfProc = BinaryProc; type.fromUtfProc = BinaryProc; type.freeProc = NULL; @@ -565,21 +596,63 @@ TclInitEncodingSubsystem(void) tclIdentityEncoding = Tcl_CreateEncoding(&type); type.encodingName = "utf-8"; - type.toUtfProc = UtfExtToUtfIntProc; - type.fromUtfProc = UtfIntToUtfExtProc; + type.toUtfProc = UtfToUtfProc; + type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); - type.encodingName = "unicode"; - type.toUtfProc = UnicodeToUtfProc; - type.fromUtfProc = UtfToUnicodeProc; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = INT2PTR(isLe.c); + type.encodingName = "ucs-2le"; + type.clientData = INT2PTR(TCL_ENCODING_LE); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2"; + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); + type.toUtfProc = Utf32ToUtfProc; + type.fromUtfProc = UtfToUtf32Proc; + type.freeProc = NULL; + type.nullSize = 4; + type.encodingName = "utf-32le"; + type.clientData = INT2PTR(TCL_ENCODING_LE); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-32"; + type.clientData = INT2PTR(leFlags); + Tcl_CreateEncoding(&type); + + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUtf16Proc; + type.freeProc = NULL; + type.nullSize = 2; + type.encodingName = "utf-16le"; + type.clientData = INT2PTR(TCL_ENCODING_LE|ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16be"; + type.clientData = INT2PTR(ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16"; + type.clientData = INT2PTR(leFlags|ENCODING_UTF); + Tcl_CreateEncoding(&type); + +#ifndef TCL_NO_DEPRECATED + type.encodingName = "unicode"; + Tcl_CreateEncoding(&type); +#endif + /* * Need the iso8859-1 encoding in order to process binary data, so force * it to always be embedded. Note that this encoding *must* be a proper @@ -686,13 +759,14 @@ TclFinalizeEncodingSubsystem(void) *------------------------------------------------------------------------- */ +#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 const char * Tcl_GetDefaultEncodingDir(void) { int numDirs; Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath(); - TclListObjLength(NULL, searchPath, &numDirs); + TclListObjLengthM(NULL, searchPath, &numDirs); if (numDirs == 0) { return NULL; } @@ -723,12 +797,13 @@ Tcl_SetDefaultEncodingDir( const char *path) { Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath(); - Tcl_Obj *directory = Tcl_NewStringObj(path, -1); + Tcl_Obj *directory = Tcl_NewStringObj(path, TCL_INDEX_NONE); searchPath = Tcl_DuplicateObj(searchPath); Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory); Tcl_SetEncodingSearchPath(searchPath); } +#endif /* *------------------------------------------------------------------------- @@ -834,9 +909,6 @@ FreeEncoding( if (encodingPtr == NULL) { return; } - if (encodingPtr->refCount<=0) { - Tcl_Panic("FreeEncoding: refcount problem !!!"); - } if (encodingPtr->refCount-- <= 1) { if (encodingPtr->freeProc != NULL) { encodingPtr->freeProc(encodingPtr->clientData); @@ -856,7 +928,7 @@ FreeEncoding( * * Tcl_GetEncodingName -- * - * Given an encoding, return the name that was used to constuct the + * Given an encoding, return the name that was used to construct the * encoding. * * Results: @@ -921,7 +993,7 @@ Tcl_GetEncodingNames( Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); Tcl_CreateHashEntry(&table, - Tcl_NewStringObj(encodingPtr->name, -1), &dummy); + Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy); } Tcl_MutexUnlock(&encodingMutex); @@ -951,6 +1023,33 @@ Tcl_GetEncodingNames( } /* + *------------------------------------------------------------------------- + * + * Tcl_GetEncodingNulLength -- + * + * Given an encoding, return the number of nul bytes used for the + * string termination. + * + * Results: + * The number of nul bytes used for the string termination. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ +Tcl_Size +Tcl_GetEncodingNulLength( + Tcl_Encoding encoding) +{ + if (encoding == NULL) { + encoding = systemEncoding; + } + + return ((Encoding *) encoding)->nullSize; +} + +/* *------------------------------------------------------------------------ * * Tcl_SetSystemEncoding -- @@ -1035,9 +1134,26 @@ Tcl_CreateEncoding( const Tcl_EncodingType *typePtr) /* The encoding type. */ { + Encoding *encodingPtr = (Encoding *)ckalloc(sizeof(Encoding)); + encodingPtr->name = NULL; + encodingPtr->toUtfProc = typePtr->toUtfProc; + encodingPtr->fromUtfProc = typePtr->fromUtfProc; + encodingPtr->freeProc = typePtr->freeProc; + encodingPtr->nullSize = typePtr->nullSize; + encodingPtr->clientData = typePtr->clientData; + if (typePtr->nullSize == 2) { + encodingPtr->lengthProc = (LengthProc *) unilen; + } else if (typePtr->nullSize == 4) { + encodingPtr->lengthProc = (LengthProc *) unilen4; + } else { + encodingPtr->lengthProc = (LengthProc *) strlen; + } + encodingPtr->refCount = 1; + encodingPtr->hPtr = NULL; + + if (typePtr->encodingName) { Tcl_HashEntry *hPtr; int isNew; - Encoding *encodingPtr; char *name; Tcl_MutexLock(&encodingMutex); @@ -1048,30 +1164,17 @@ Tcl_CreateEncoding( * reference goes away. */ - encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); - encodingPtr->hPtr = NULL; + Encoding *replaceMe = (Encoding *)Tcl_GetHashValue(hPtr); + replaceMe->hPtr = NULL; } name = (char *)ckalloc(strlen(typePtr->encodingName) + 1); - - encodingPtr = (Encoding *)ckalloc(sizeof(Encoding)); encodingPtr->name = strcpy(name, typePtr->encodingName); - encodingPtr->toUtfProc = typePtr->toUtfProc; - encodingPtr->fromUtfProc = typePtr->fromUtfProc; - encodingPtr->freeProc = typePtr->freeProc; - encodingPtr->nullSize = typePtr->nullSize; - encodingPtr->clientData = typePtr->clientData; - if (typePtr->nullSize == 1) { - encodingPtr->lengthProc = (LengthProc *) strlen; - } else { - encodingPtr->lengthProc = (LengthProc *) unilen; - } - encodingPtr->refCount = 1; encodingPtr->hPtr = hPtr; Tcl_SetHashValue(hPtr, encodingPtr); Tcl_MutexUnlock(&encodingMutex); - + } return (Tcl_Encoding) encodingPtr; } @@ -1101,24 +1204,102 @@ Tcl_ExternalToUtfDString( Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - int srcLen, /* Source string length in bytes, or < 0 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_ExternalToUtfDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_ExternalToUtfDStringEx -- + * + * Convert a source buffer from the specified encoding into UTF-8. + * The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in utf-8. + * Possible flags values: + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * Any other flag bits will cause an error to be returned (for future + * compatibility) + * + * Results: + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * + * Side effects: + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner. + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. + * + *------------------------------------------------------------------------- + */ + +int +Tcl_ExternalToUtfDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ + Tcl_Encoding encoding, /* The encoding for the source string, or NULL + * for the default system encoding. */ + const char *src, /* Source string in specified encoding. */ + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + * encoding-specific string length. */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the + * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int result, srcRead, dstWrote, dstChars; + Tcl_Size dstLen, soFar; + const char *srcStart = src; + /* DO FIRST - Must always be initialized before returning */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; if (encoding == NULL) { encoding = systemEncoding; } - encodingPtr = (Encoding *) encoding; + encodingPtr = (Encoding *)encoding; if (src == NULL) { srcLen = 0; @@ -1126,20 +1307,48 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; + if (encodingPtr->toUtfProc == UtfToUtfProc) { + flags |= ENCODING_INPUT; + } while (1) { - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); + result = encodingPtr->toUtfProc(encodingPtr->clientData, src, + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { + Tcl_Size nBytesProcessed = (src - srcStart); + Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + char buf[TCL_INTEGER_SPACE]; + sprintf(buf, "%u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("unexpected byte sequence starting at index %" + "u: '\\x%02X'", + nBytesProcessed, + UCHAR(srcStart[nBytesProcessed]))); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL); + } + } + return result; } + /* Expand space and continue */ flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1170,11 +1379,11 @@ Tcl_ExternalToUtfDString( int Tcl_ExternalToUtf( - Tcl_Interp *interp, /* Interp for error return, if not NULL. */ + TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - int srcLen, /* Source string length in bytes, or < 0 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state @@ -1184,7 +1393,7 @@ Tcl_ExternalToUtf( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ - int dstLen, /* The maximum length of output buffer in + Tcl_Size dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. This may @@ -1233,21 +1442,24 @@ Tcl_ExternalToUtf( } if (!noTerminate) { - if (dstLen < 1) { - return TCL_CONVERT_NOSPACE; - } + if (dstLen < 1) { + return TCL_CONVERT_NOSPACE; + } /* * If there are any null characters in the middle of the buffer, - * they will converted to the UTF-8 null character (\xC080). To get + * they will converted to the UTF-8 null character (\xC0\x80). To get * the actual \0 at the end of the destination buffer, we need to * append it manually. First make room for it... */ dstLen--; } else { - if (dstLen < 0) { - return TCL_CONVERT_NOSPACE; - } + if (dstLen < 0) { + return TCL_CONVERT_NOSPACE; + } + } + if (encodingPtr->toUtfProc == UtfToUtfProc) { + flags |= ENCODING_INPUT; } do { Tcl_EncodingState savedState = *statePtr; @@ -1258,7 +1470,7 @@ Tcl_ExternalToUtf( if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); + dstLen = TclUtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); *statePtr = savedState; } while (1); if (!noTerminate) { @@ -1295,17 +1507,92 @@ Tcl_UtfToExternalDString( Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - int srcLen, /* Source string length in bytes, or < 0 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_UtfToExternalDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_UtfToExternalDStringEx -- + * + * Convert a source buffer from UTF-8 to the specified encoding. + * The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in the + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * + * Results: + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * + * Side effects: + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. + * + *------------------------------------------------------------------------- + */ + +int +Tcl_UtfToExternalDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ + Tcl_Encoding encoding, /* The encoding for the converted string, or + * NULL for the default system encoding. */ + const char *src, /* Source string in UTF-8. */ + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + * strlen(). */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the + * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int result, srcRead, dstWrote, dstChars; + const char *srcStart = src; + Tcl_Size dstLen, soFar; + /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetObjResult( + interp, + Tcl_NewStringObj( + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_INDEX_NONE)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1319,23 +1606,49 @@ Tcl_UtfToExternalDString( } else if (srcLen < 0) { srcLen = strlen(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + + flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, - &dstChars); + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - if (encodingPtr->nullSize == 2) { - Tcl_DStringSetLength(dstPtr, soFar + 1); + Tcl_Size nBytesProcessed = (src - srcStart); + int i = soFar + encodingPtr->nullSize - 1; + while (i >= soFar) { + Tcl_DStringSetLength(dstPtr, i--); } - Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + int pos = Tcl_NumUtfChars(srcStart, nBytesProcessed); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4); + sprintf(buf, "%u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "unexpected character at index %d: 'U+%06X'", + pos, + ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1366,11 +1679,11 @@ Tcl_UtfToExternalDString( int Tcl_UtfToExternal( - Tcl_Interp *interp, /* Interp for error return, if not NULL. */ + TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - int srcLen, /* Source string length in bytes, or < 0 for + Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state @@ -1380,7 +1693,7 @@ Tcl_UtfToExternal( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string * is stored. */ - int dstLen, /* The maximum length of output buffer in + Tcl_Size dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. This may @@ -1423,20 +1736,17 @@ Tcl_UtfToExternal( } if (dstLen < encodingPtr->nullSize) { - return TCL_CONVERT_NOSPACE; + return TCL_CONVERT_NOSPACE; } dstLen -= encodingPtr->nullSize; result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr); + flags, statePtr, dst, dstLen, srcReadPtr, + dstWrotePtr, dstCharsPtr); /* * Buffer is terminated irrespective of result. Not sure this is * reasonable but keep for historical/compatibility reasons. */ - if (encodingPtr->nullSize == 2) { - dst[*dstWrotePtr + 1] = '\0'; - } - dst[*dstWrotePtr] = '\0'; + memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); return result; } @@ -1459,14 +1769,15 @@ Tcl_UtfToExternal( *--------------------------------------------------------------------------- */ #undef Tcl_FindExecutable -void +const char * Tcl_FindExecutable( const char *argv0) /* The value of the application's argv[0] * (native). */ { - TclInitSubsystems(); + const char *version = Tcl_InitSubsystems(); TclpSetInitialEncodings(); TclpFindExecutable(argv0); + return version; } /* @@ -1494,17 +1805,17 @@ OpenEncodingFileChannel( const char *name) /* The name of the encoding file on disk and * also the name for new encoding. */ { - Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1); + Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE); Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj); Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath()); Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap); Tcl_Obj **dir, *path, *directory = NULL; Tcl_Channel chan = NULL; - int i, numDirs; + Tcl_Size i, numDirs; - TclListObjGetElements(NULL, searchPath, &numDirs, &dir); + TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir); Tcl_IncrRefCount(nameObj); - Tcl_AppendToObj(fileNameObj, ".enc", -1); + Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE); Tcl_IncrRefCount(fileNameObj); Tcl_DictObjGet(NULL, map, nameObj, &directory); @@ -1723,7 +2034,7 @@ LoadTableEncoding( }; Tcl_DStringInit(&lineString); - if (Tcl_Gets(chan, &lineString) == -1) { + if (Tcl_Gets(chan, &lineString) == TCL_IO_FAILURE) { return NULL; } line = Tcl_DStringValue(&lineString); @@ -1765,7 +2076,7 @@ LoadTableEncoding( for (i = 0; i < numPages; i++) { int ch; const char *p; - int expected = 3 + 16 * (16 * 4 + 1); + Tcl_Size expected = 3 + 16 * (16 * 4 + 1); if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) { return NULL; @@ -1849,8 +2160,8 @@ LoadTableEncoding( */ if (dataPtr->fromUnicode[0] != NULL) { - if (dataPtr->fromUnicode[0]['\\'] == '\0') { - dataPtr->fromUnicode[0]['\\'] = '\\'; + if (dataPtr->fromUnicode[0][(int)'\\'] == '\0') { + dataPtr->fromUnicode[0][(int)'\\'] = '\\'; } } } @@ -2001,7 +2312,7 @@ LoadEscapeEncoding( Tcl_DStringInit(&escapeData); while (1) { - int argc; + Tcl_Size argc; const char **argv; char *line; Tcl_DString lineString; @@ -2053,7 +2364,7 @@ LoadEscapeEncoding( Tcl_DStringFree(&lineString); } - size = TclOffset(EscapeEncodingData, subTables) + size = offsetof(EscapeEncodingData, subTables) + Tcl_DStringLength(&escapeData); dataPtr = (EscapeEncodingData *)ckalloc(size); dataPtr->initLen = strlen(init); @@ -2111,15 +2422,11 @@ LoadEscapeEncoding( static int BinaryProc( - ClientData clientData, /* Not used. */ + TCL_UNUSED(void *), const char *src, /* Source string (unknown encoding). */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2140,6 +2447,7 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { srcLen = *dstCharsPtr; } @@ -2158,11 +2466,11 @@ BinaryProc( /* *------------------------------------------------------------------------- * - * UtfIntToUtfExtProc -- + * UtfToUtfProc -- * - * Convert from UTF-8 to UTF-8. While converting null-bytes from the - * Tcl's internal representation (0xC0, 0x80) to the official - * representation (0x00). See UtfToUtfProc for details. + * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation + * is not a no-op, because it turns a stream of improperly formed + * UTF-8 into a properly-formed stream. * * Results: * Returns TCL_OK if conversion was successful. @@ -2174,18 +2482,14 @@ BinaryProc( */ static int -UtfIntToUtfExtProc( - ClientData clientData, /* Not used. */ +UtfToUtfProc( + void *clientData, /* additional flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ - char *dst, /* Output buffer in which converted string - * is stored. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ int dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the @@ -2200,18 +2504,195 @@ UtfIntToUtfExtProc( * correspond to the bytes stored in the * output buffer. */ { - return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 1); -} + const char *srcStart, *srcEnd, *srcClose; + const char *dstStart, *dstEnd; + int result, numChars, charLimit = INT_MAX; + int ch; + int profile; + + result = TCL_OK; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= 6; + } + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } + + dstStart = dst; + flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + + profile = CHANNEL_PROFILE_GET(flags); + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { + /* + * Copy 7bit characters, but skip null-bytes when we are in input + * mode, so that they get converted to \xC0\x80. + */ + *dst++ = *src++; + } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && + (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || + PROFILE_REPLACE(profile))) { + /* Special sequence \xC0\x80 */ + if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } + } else { + /* + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. + */ + *dst++ = 0; + src += 2; + } + + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + /* + * Incomplete byte sequence. + * Always check before using TclUtfToUCS4. Not doing can so + * cause it run beyond the end of the buffer! If we happen such an + * incomplete char its bytes are made to represent themselves + * unless the user has explicitly asked to be told. + */ + + if (flags & ENCODING_INPUT) { + /* Incomplete bytes for modified UTF-8 target */ + if (PROFILE_STRICT(profile)) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; + break; + } + } + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + ++src; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + char chbuf[2]; + chbuf[0] = UCHAR(*src++); chbuf[1] = 0; + TclUtfToUCS4(chbuf, &ch); + } + dst += Tcl_UniCharToUtf(ch, dst); + } else { + int low; + int isInvalid = 0; + size_t len = TclUtfToUCS4(src, &ch); + if (flags & ENCODING_INPUT) { + if ((len < 2) && (ch != 0)) { + isInvalid = 1; + } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) { + isInvalid = 1; + } + if (isInvalid) { + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } + } + } + const char *saveSrc = src; + src += len; + if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { + /* + * A surrogate character is detected, handle especially. + */ + if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } else { + low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } +cesu8: + *dst++ = (char)(((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char)(((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char)((ch | 0x80) & 0xBF); + continue; + } + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; + } + } else if (PROFILE_STRICT(profile) && + (!(flags & ENCODING_INPUT)) && + SURROGATE(ch)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } else if (PROFILE_STRICT(profile) && + (flags & ENCODING_INPUT) && + SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; + src = saveSrc; + break; + } + dst += Tcl_UniCharToUtf(ch, dst); + } + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + /* *------------------------------------------------------------------------- * - * UtfExtToUtfIntProc -- + * Utf32ToUtfProc -- * - * Convert from UTF-8 to UTF-8 while converting null-bytes from the - * official representation (0x00) to Tcl's internal representation (0xC0, - * 0x80). See UtfToUtfProc for details. + * Convert from UTF-32 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2223,16 +2704,12 @@ UtfIntToUtfExtProc( */ static int -UtfExtToUtfIntProc( - ClientData clientData, /* Not used. */ - const char *src, /* Source string in UTF-8. */ +Utf32ToUtfProc( + void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2249,18 +2726,128 @@ UtfExtToUtfIntProc( * correspond to the bytes stored in the * output buffer. */ { - return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, - srcReadPtr, dstWrotePtr, dstCharsPtr, 0); -} + const char *srcStart, *srcEnd; + const char *dstEnd, *dstStart; + int result, numChars, charLimit = INT_MAX; + int ch = 0, bytesLeft = srcLen % 4; + + flags = TclEncodingSetProfileFlags(flags); + flags |= PTR2INT(clientData); + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } + result = TCL_OK; + + /* + * Check alignment with utf-32 (4 == sizeof(UTF-32)) + */ + if (bytesLeft != 0) { + /* We have a truncated code unit */ + result = TCL_CONVERT_MULTIBYTE; + srcLen -= bytesLeft; + } + + /* + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. + */ + + if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) { + result = TCL_CONVERT_MULTIBYTE; + srcLen-= 4; + } + + srcStart = src; + srcEnd = src + srcLen; + + dstStart = dst; + dstEnd = dst + dstLen - TCL_UTF_MAX; + + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + + int prev = ch; + if (flags & TCL_ENCODING_LE) { + ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); + } + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + + if ((unsigned)ch > 0x10FFFF) { + ch = UNICODE_REPLACE_CHAR; + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + break; + } + } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; + ch = 0; + break; + } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) { + ch = UNICODE_REPLACE_CHAR; + } + + /* + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. + */ + + if ((unsigned)ch - 1 < 0x7F) { + *dst++ = (ch & 0xFF); + } else { + if (!HIGH_SURROGATE(prev) && LOW_SURROGATE(ch)) { + *dst = 0; /* In case of lower surrogate, don't try to combine */ + } + dst += Tcl_UniCharToUtf(ch, dst); + } + src += 4; + } + + if (HIGH_SURROGATE(ch)) { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { + /* We have a code fragment left-over at the end */ + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + /* destination is not full, so we really are at the end now */ + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src += bytesLeft; /* Go past truncated code unit */ + } + } + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + /* *------------------------------------------------------------------------- * - * UtfToUtfProc -- + * UtfToUtf32Proc -- * - * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation - * is not a no-op, because it turns a stream of improperly formed - * UTF-8 into a properly-formed stream. + * Convert from UTF-8 to UTF-32. * * Results: * Returns TCL_OK if conversion was successful. @@ -2272,16 +2859,12 @@ UtfExtToUtfIntProc( */ static int -UtfToUtfProc( - ClientData clientData, /* Not used. */ +UtfToUtf32Proc( + void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2294,37 +2877,28 @@ UtfToUtfProc( int *dstWrotePtr, /* Filled with the number of bytes that were * stored in the output buffer as a result of * the conversion. */ - int *dstCharsPtr, /* Filled with the number of characters that + int *dstCharsPtr) /* Filled with the number of characters that * correspond to the bytes stored in the * output buffer. */ - int pureNullMode) /* Convert embedded nulls from internal - * representation to real null-bytes or vice - * versa. Also combine or separate surrogate pairs */ { - const char *srcStart, *srcEnd, *srcClose; - const char *dstStart, *dstEnd; - int result, numChars, charLimit = INT_MAX; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; - - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } - result = TCL_OK; + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; + int ch, len; srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { - srcClose -= 6; - } - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; + srcClose -= TCL_UTF_MAX; } dstStart = dst; - dstEnd = dst + dstLen - ((pureNullMode == 1) ? 4 : TCL_UTF_MAX); + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2338,77 +2912,27 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (pureNullMode == 0))) { - /* - * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to 0xC080. - */ - - *dst++ = *src++; - *chPtr = 0; /* reset surrogate handling */ - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (pureNullMode == 1)) { - /* - * Convert 0xC080 to real nulls when we are in output mode. - */ - - *dst++ = 0; - *chPtr = 0; /* reset surrogate handling */ - src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { - /* - * Always check before using TclUtfToUniChar. Not doing can so - * cause it run beyond the end of the buffer! If we happen such an - * incomplete char its bytes are made to represent themselves - * unless the user has explicitly asked to be told. - */ - - if ((flags & TCL_ENCODING_STOPONERROR) && (pureNullMode == 0)) { - result = TCL_CONVERT_MULTIBYTE; + len = TclUtfToUCS4(src, &ch); + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; break; } - *chPtr = UCHAR(*src); - src += 1; - dst += Tcl_UniCharToUtf(*chPtr, dst); - } else { - size_t len = TclUtfToUniChar(src, chPtr); - if ((len < 2) && (*chPtr != 0) && (flags & TCL_ENCODING_STOPONERROR) - && ((*chPtr & ~0x7FF) != 0xD800) && (pureNullMode == 0)) { - result = TCL_CONVERT_SYNTAX; - break; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; } - src += len; - if ((*chPtr & ~0x7FF) == 0xD800) { - Tcl_UniChar low; - /* A surrogate character is detected, handle especially */ -#if TCL_UTF_MAX <= 4 - if ((len < 3) && ((src[3 - len] & 0xC0) != 0x80)) { - /* It's invalid. See [ed29806ba] */ - *chPtr = UCHAR(src[-1]); - dst += Tcl_UniCharToUtf(*chPtr, dst); - continue; - } -#endif - low = *chPtr; - len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); - *chPtr = 0; /* reset surrogate handling */ - continue; - } else if ((TCL_UTF_MAX > 3) || (pureNullMode == 1)) { - int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; - *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7); - *dst++ = (char) (((full >> 12) | 0x80) & 0xBF); - *dst++ = (char) (((full >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((full | 0x80) & 0xBF); - *chPtr = 0; /* reset surrogate handling */ - src += len; - continue; - } - } - dst += Tcl_UniCharToUtf(*chPtr, dst); + } + src += len; + if (flags & TCL_ENCODING_LE) { + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 24) & 0xFF); + } else { + *dst++ = ((ch >> 24) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); } } @@ -2421,7 +2945,7 @@ UtfToUtfProc( /* *------------------------------------------------------------------------- * - * UnicodeToUtfProc -- + * Utf16ToUtfProc -- * * Convert from UTF-16 to UTF-8. * @@ -2435,16 +2959,12 @@ UtfToUtfProc( */ static int -UnicodeToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ +Utf16ToUtfProc( + void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2464,8 +2984,10 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - unsigned short ch; + unsigned short ch = 0; + flags = TclEncodingSetProfileFlags(flags); + flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -2480,18 +3002,16 @@ UnicodeToUtfProc( srcLen--; } -#if TCL_UTF_MAX > 3 /* * If last code point is a high surrogate, we cannot handle that yet, * unless we are at the end. */ if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) && - ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } -#endif srcStart = src; srcEnd = src + srcLen; @@ -2505,37 +3025,76 @@ UnicodeToUtfProc( break; } - if (clientData) { + unsigned short prev = ch; + if (flags & TCL_ENCODING_LE) { ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } + if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; /* Go back to beginning of high surrogate */ + dst--; /* Also undo writing a single byte too much */ + numChars--; + break; + } + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (ch & 0xFF); + } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { + dst += Tcl_UniCharToUtf(ch, dst); + } else if (LOW_SURROGATE(ch) && PROFILE_STRICT(flags)) { + /* Lo surrogate not preceded by Hi surrogate */ + result = TCL_CONVERT_SYNTAX; + break; } else { + *dst = 0; /* In case of lower surrogate, don't try to combine */ dst += Tcl_UniCharToUtf(ch, dst); } src += sizeof(unsigned short); } + if (HIGH_SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + src -= 2; + dst--; + numChars--; + } else { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } + } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a single byte left-over at the end */ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - /* destination is not full, so we really are at the end now */ - result = TCL_OK; - dst += Tcl_UniCharToUtf(0xFFFD, dst); - numChars++; - src++; + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src++; /* Go past truncated code unit */ + } } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2545,7 +3104,7 @@ UnicodeToUtfProc( /* *------------------------------------------------------------------------- * - * UtfToUnicodeProc -- + * UtfToUtf16Proc -- * * Convert from UTF-8 to UTF-16. * @@ -2559,16 +3118,12 @@ UnicodeToUtfProc( */ static int -UtfToUnicodeProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ +UtfToUtf16Proc( + void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2587,20 +3142,19 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int ch, len; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } dstStart = dst; dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { @@ -2617,38 +3171,154 @@ UtfToUnicodeProc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, chPtr); - - if (clientData) { -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + len = TclUtfToUCS4(src, &ch); + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } + } + src += len; + if (flags & TCL_ENCODING_LE) { + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); -#endif } else { -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); } else { - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; - *dst++ = (*chPtr & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; + *dst++ = (ch & 0xFF); + } + } + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUcs2Proc -- + * + * Convert from UTF-8 to UCS-2. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUcs2Proc( + void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + TCL_UNUSED(Tcl_EncodingState *), + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars, len; + Tcl_UniChar ch = 0; + + flags = TclEncodingSetProfileFlags(flags); + flags |= PTR2INT(clientData); + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } +#if TCL_UTF_MAX < 4 + len = TclUtfToUniChar(src, &ch); + if ((ch >= 0xD800) && (len < 3)) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; } + src += len; + src += TclUtfToUniChar(src, &ch); + ch = UNICODE_REPLACE_CHAR; + } #else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); + len = TclUtfToUniChar(src, &ch); + if (ch > 0xFFFF) { + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = UNICODE_REPLACE_CHAR; + } #endif + if (PROFILE_STRICT(flags) && SURROGATE(ch)) { + result = TCL_CONVERT_SYNTAX; + break; + } + + src += len; + + /* + * Need to handle this in a way that won't cause misalignment by + * casting dst to a Tcl_UniChar. [Bug 1122671] + */ + + if (flags & TCL_ENCODING_LE) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); } } *srcReadPtr = src - srcStart; @@ -2676,16 +3346,12 @@ UtfToUnicodeProc( static int TableToUtfProc( - ClientData clientData, /* TableEncodingData that specifies + void *clientData, /* TableEncodingData that specifies * encoding. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2710,6 +3376,7 @@ TableToUtfProc( const unsigned short *pageZero; TableEncodingData *dataPtr = (TableEncodingData *)clientData; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -2733,30 +3400,47 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { - src--; - result = TCL_CONVERT_MULTIBYTE; - break; + if (!(flags & TCL_ENCODING_END)) { + src--; + result = TCL_CONVERT_MULTIBYTE; + break; + } else if (PROFILE_STRICT(flags)) { + src--; + result = TCL_CONVERT_SYNTAX; + break; + } else if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + src--; /* See bug [bdcb5126c0] */ + result = TCL_CONVERT_MULTIBYTE; + break; + } + } else { + ch = toUnicode[byte][*((unsigned char *)src)]; } - ch = toUnicode[byte][*((unsigned char *) src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar)byte; + } } /* * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -2789,16 +3473,12 @@ TableToUtfProc( static int TableFromUtfProc( - ClientData clientData, /* TableEncodingData that specifies + void *clientData, /* TableEncodingData that specifies * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2830,6 +3510,7 @@ TableFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2849,12 +3530,12 @@ TableFromUtfProc( } len = TclUtfToUniChar(src, &ch); -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 /* Unicode chars > +U0FFFF cannot be represented in any table encoding */ if (ch & 0xFFFF0000) { word = 0; } else -#elif TCL_UTF_MAX == 4 +#else if (!len) { word = 0; } else @@ -2862,11 +3543,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -2911,15 +3592,11 @@ TableFromUtfProc( static int Iso88591ToUtfProc( - ClientData clientData, /* Ignored. */ + TCL_UNUSED(void *), const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2940,6 +3617,7 @@ Iso88591ToUtfProc( const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -2963,7 +3641,7 @@ Iso88591ToUtfProc( * Special case for 1-byte utf chars for speed. */ - if (ch && ch < 0x80) { + if ((unsigned)ch - 1 < 0x7F) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -2995,15 +3673,11 @@ Iso88591ToUtfProc( static int Iso88591FromUtfProc( - ClientData clientData, /* Ignored. */ + TCL_UNUSED(void *), const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -3028,6 +3702,7 @@ Iso88591FromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3054,15 +3729,15 @@ Iso88591FromUtfProc( */ if (ch > 0xFF -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX < 4 || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (len < 3)) { len = 4; } @@ -3071,7 +3746,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3107,10 +3782,10 @@ Iso88591FromUtfProc( static void TableFreeProc( - ClientData clientData) /* TableEncodingData that specifies + void *clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *) clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3142,7 +3817,7 @@ TableFreeProc( static int EscapeToUtfProc( - ClientData clientData, /* EscapeEncodingData that specifies + void *clientData, /* EscapeEncodingData that specifies * encoding. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ @@ -3168,13 +3843,14 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; int state, result, numChars, charLimit = INT_MAX; const char *dstStart, *dstEnd; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3285,9 +3961,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if ((flags & TCL_ENCODING_STOPONERROR) == 0) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3356,7 +4033,7 @@ EscapeToUtfProc( static int EscapeFromUtfProc( - ClientData clientData, /* EscapeEncodingData that specifies + void *clientData, /* EscapeEncodingData that specifies * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ @@ -3397,6 +4074,7 @@ EscapeFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3460,7 +4138,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (flags & TCL_ENCODING_STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3493,8 +4171,7 @@ EscapeFromUtfProc( result = TCL_CONVERT_NOSPACE; break; } - memcpy(dst, subTablePtr->sequence, - subTablePtr->sequenceLen); + memcpy(dst, subTablePtr->sequence, subTablePtr->sequenceLen); dst += subTablePtr->sequenceLen; } } @@ -3568,7 +4245,7 @@ EscapeFromUtfProc( static void EscapeFreeProc( - ClientData clientData) /* EscapeEncodingData that specifies + void *clientData) /* EscapeEncodingData that specifies * encoding. */ { EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; @@ -3645,7 +4322,7 @@ GetTableEncoding( /* *--------------------------------------------------------------------------- * - * unilen -- + * unilen, unilen4 -- * * A helper function for the Tcl_ExternalToUtf functions. This function * is similar to strlen for double-byte characters: it returns the number @@ -3672,6 +4349,19 @@ unilen( } return (char *) p - src; } + +static size_t +unilen4( + const char *src) +{ + unsigned int *p; + + p = (unsigned int *) src; + while (*p != 0x00000000) { + p++; + } + return (char *) p - src; +} /* *------------------------------------------------------------------------- @@ -3699,11 +4389,11 @@ unilen( static void InitializeEncodingSearchPath( char **valuePtr, - int *lengthPtr, + TCL_HASH_TYPE *lengthPtr, Tcl_Encoding *encodingPtr) { const char *bytes; - int i, numDirs, numBytes; + Tcl_Size i, numDirs, numBytes; Tcl_Obj *libPathObj, *encodingObj, *searchPathObj; TclNewLiteralStringObj(encodingObj, "encoding"); @@ -3712,7 +4402,7 @@ InitializeEncodingSearchPath( Tcl_IncrRefCount(searchPathObj); libPathObj = TclGetLibraryPath(); Tcl_IncrRefCount(libPathObj); - TclListObjLength(NULL, libPathObj, &numDirs); + TclListObjLengthM(NULL, libPathObj, &numDirs); for (i = 0; i < numDirs; i++) { Tcl_Obj *directoryObj, *pathObj; @@ -3733,7 +4423,7 @@ InitializeEncodingSearchPath( if (*encodingPtr) { ((Encoding *)(*encodingPtr))->refCount++; } - bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes); + bytes = TclGetStringFromObj(searchPathObj, &numBytes); *lengthPtr = numBytes; *valuePtr = (char *)ckalloc(numBytes + 1); @@ -3742,6 +4432,166 @@ InitializeEncodingSearchPath( } /* + *------------------------------------------------------------------------ + * + * TclEncodingProfileParseName -- + * + * Maps an encoding profile name to its integer equivalent. + * + * Results: + * TCL_OK on success or TCL_ERROR on failure. + * + * Side effects: + * Returns the profile enum value in *profilePtr + * + *------------------------------------------------------------------------ + */ +int +TclEncodingProfileNameToId( + Tcl_Interp *interp, /* For error messages. May be NULL */ + const char *profileName, /* Name of profile */ + int *profilePtr) /* Output */ +{ + size_t i; + size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + + for (i = 0; i < numProfiles; ++i) { + if (!strcmp(profileName, encodingProfiles[i].name)) { + *profilePtr = encodingProfiles[i].value; + return TCL_OK; + } + } + if (interp) { + Tcl_Obj *errorObj; + /* This code assumes at least two profiles :-) */ + errorObj = + Tcl_ObjPrintf("bad profile name \"%s\": must be", + profileName); + for (i = 0; i < (numProfiles - 1); ++i) { + Tcl_AppendStringsToObj( + errorObj, " ", encodingProfiles[i].name, ",", NULL); + } + Tcl_AppendStringsToObj( + errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL); + + Tcl_SetObjResult(interp, errorObj); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); + } + return TCL_ERROR; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingProfileValueToName -- + * + * Maps an encoding profile value to its name. + * + * Results: + * Pointer to the name or NULL on failure. Caller must not make + * not modify the string and must make a copy to hold on to it. + * + * Side effects: + * None. + *------------------------------------------------------------------------ + */ +const char * +TclEncodingProfileIdToName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int profileValue) /* Profile #define value */ +{ + size_t i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (profileValue == encodingProfiles[i].value) { + return encodingProfiles[i].name; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "Internal error. Bad profile id \"%d\".", + profileValue)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILEID", NULL); + } + return NULL; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingSetProfileFlags -- + * + * Maps the flags supported in the encoding C API's to internal flags. + * + * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is + * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile + * specified. + * + * If no profile or an invalid profile is specified, it is set to + * the default. + * + * Results: + * Internal encoding flag mask. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------ + */ +int TclEncodingSetProfileFlags(int flags) +{ + if (flags & TCL_ENCODING_STOPONERROR) { + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); + } else { + int profile = CHANNEL_PROFILE_GET(flags); + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: + break; + case 0: /* Unspecified by caller */ + default: + CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_TCL8); + break; + } + } + return flags; +} + +/* + *------------------------------------------------------------------------ + * + * TclGetEncodingProfiles -- + * + * Get the list of supported encoding profiles. + * + * Results: + * None. + * + * Side effects: + * The list of profile names is stored in the interpreter result. + * + *------------------------------------------------------------------------ + */ +void +TclGetEncodingProfiles(Tcl_Interp *interp) +{ + size_t i, n; + Tcl_Obj *objPtr; + n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + objPtr = Tcl_NewListObj(n, NULL); + for (i = 0; i < n; ++i) { + Tcl_ListObjAppendElement( + interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE)); + } + Tcl_SetObjResult(interp, objPtr); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 |