diff options
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 2473 |
1 files changed, 754 insertions, 1719 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 39c6ee3..54a49aa 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -3,14 +3,13 @@ * * Contains the implementation of the encoding conversion package. * - * Copyright © 1996-1998 Sun Microsystems, Inc. + * Copyright (c) 1996-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ #include "tclInt.h" -#include <assert.h> typedef size_t (LengthProc)(const char *src); @@ -19,7 +18,7 @@ typedef size_t (LengthProc)(const char *src); * convert between various character sets and UTF-8. */ -typedef struct { +typedef struct Encoding { char *name; /* Name of encoding. Malloced because (1) hash * table entry that owns this encoding may be * freed prior to this encoding being freed, @@ -34,22 +33,20 @@ typedef struct { Tcl_EncodingFreeProc *freeProc; /* If non-NULL, function to call when this * encoding is deleted. */ - void *clientData; /* Arbitrary value associated with encoding - * type. Passed to conversion functions. */ - Tcl_Size nullSize; /* Number of 0x00 bytes that signify + int nullSize; /* Number of 0x00 bytes that signify * end-of-string in this encoding. This number * is used to determine the source string * length when the srcLen argument is - * negative. This number can be 1, 2, or 4. */ + * negative. This number can be 1 or 2. */ + ClientData clientData; /* Arbitrary value associated with encoding + * type. Passed to conversion functions. */ LengthProc *lengthProc; /* Function to compute length of * null-terminated strings in this encoding. * If nullSize is 1, this is strlen; if * nullSize is 2, this is a function that * returns the number of bytes in a 0x0000 - * terminated string; if nullSize is 4, this - * is a function that returns the number of - * bytes in a 0x00000000 terminated string. */ - size_t refCount; /* Number of uses of this structure. */ + * terminated string. 
*/ + int refCount; /* Number of uses of this structure. */ Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ } Encoding; @@ -60,7 +57,7 @@ typedef struct { * encoding. */ -typedef struct { +typedef struct TableEncodingData { int fallback; /* Character (in this encoding) to substitute * when this encoding cannot represent a UTF-8 * character. */ @@ -86,7 +83,7 @@ typedef struct { } TableEncodingData; /* - * Each of the following structures is the clientData for a dynamically-loaded + * The following structures is the clientData for a dynamically-loaded, * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -94,8 +91,8 @@ typedef struct { * for switching character sets. */ -typedef struct { - unsigned sequenceLen; /* Length of following string. */ +typedef struct EscapeSubTable { + unsigned int sequenceLen; /* Length of following string. */ char sequence[16]; /* Escape code that marks this encoding. */ char name[32]; /* Name for encoding. */ Encoding *encodingPtr; /* Encoding loaded using above name, or NULL @@ -103,14 +100,14 @@ typedef struct { * yet. */ } EscapeSubTable; -typedef struct { +typedef struct EscapeEncodingData { int fallback; /* Character (in this encoding) to substitute * when this encoding cannot represent a UTF-8 * character. */ - unsigned initLen; /* Length of following string. */ + unsigned int initLen; /* Length of following string. */ char init[16]; /* String to emit or expect before first char * in conversion. */ - unsigned finalLen; /* Length of following string. */ + unsigned int finalLen; /* Length of following string. */ char final[16]; /* String to emit or expect after last char in * conversion. 
*/ char prefixBytes[256]; /* If a byte in the input stream is the first @@ -119,14 +116,14 @@ typedef struct { * entry in this array is 1, otherwise it is * 0. */ int numSubTables; /* Length of following array. */ - EscapeSubTable subTables[TCLFLEXARRAY];/* Information about each EscapeSubTable used - * by this encoding type. The actual size is - * as large as necessary to hold all + EscapeSubTable subTables[1];/* Information about each EscapeSubTable used + * by this encoding type. The actual size will + * be as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; /* - * Constants used when loading an encoding file to identify the type of the + * constants used when loading an encoding file to identify the type of the * file. */ @@ -159,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it is initialized by appending /encoding + * the encodingSearchPath, then it will be initialized by appending /encoding * to each directory in this "libraryPath". */ @@ -180,40 +177,12 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding is used to perform the conversion. + * the system encoding will be used to perform the conversion. */ -static Tcl_Encoding defaultEncoding = NULL; -static Tcl_Encoding systemEncoding = NULL; -Tcl_Encoding tclIdentityEncoding = NULL; -Tcl_Encoding tclUtf8Encoding = NULL; - -/* - * Names of encoding profiles and corresponding integer values. - * Keep alphabetical order for error messages. 
- */ -static const struct TclEncodingProfiles { - const char *name; - int value; -} encodingProfiles[] = { - {"replace", TCL_ENCODING_PROFILE_REPLACE}, - {"strict", TCL_ENCODING_PROFILE_STRICT}, - {"tcl8", TCL_ENCODING_PROFILE_TCL8}, -}; - -#define PROFILE_STRICT(flags_) \ - ((flags_) & TCL_ENCODING_PROFILE_STRICT) - -#define PROFILE_REPLACE(flags_) \ - ((ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) - -#define PROFILE_TCL8(flags_) \ - ((ENCODING_PROFILE_GET(flags_) != TCL_ENCODING_PROFILE_REPLACE) && !PROFILE_STRICT(flags_)) - -#define UNICODE_REPLACE_CHAR 0xFFFD -#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) -#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) -#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) +static Tcl_Encoding defaultEncoding; +static Tcl_Encoding systemEncoding; +Tcl_Encoding tclIdentityEncoding; /* * The following variable is used in the sparse matrix code for a @@ -226,68 +195,90 @@ static unsigned short emptyPage[256]; * Functions used only in this module. 
*/ -static Tcl_EncodingConvertProc BinaryProc; -static Tcl_DupInternalRepProc DupEncodingInternalRep; -static Tcl_EncodingFreeProc EscapeFreeProc; -static Tcl_EncodingConvertProc EscapeFromUtfProc; -static Tcl_EncodingConvertProc EscapeToUtfProc; -static void FillEncodingFileMap(void); -static void FreeEncoding(Tcl_Encoding encoding); -static Tcl_FreeInternalRepProc FreeEncodingInternalRep; -static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, - int state); -static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, - const char *name); -static Tcl_Encoding LoadTableEncoding(const char *name, int type, - Tcl_Channel chan); -static Tcl_Encoding LoadEscapeEncoding(const char *name, - Tcl_Channel chan); -static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, - const char *name); -static Tcl_EncodingFreeProc TableFreeProc; -static Tcl_EncodingConvertProc TableFromUtfProc; -static Tcl_EncodingConvertProc TableToUtfProc; +static int BinaryProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static void DupEncodingIntRep(Tcl_Obj *srcPtr, Tcl_Obj *dupPtr); +static void EscapeFreeProc(ClientData clientData); +static int EscapeFromUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int EscapeToUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static void FillEncodingFileMap(void); +static void FreeEncoding(Tcl_Encoding encoding); +static void FreeEncodingIntRep(Tcl_Obj *objPtr); +static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, + int state); +static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, const char *name); +static Tcl_Encoding 
LoadTableEncoding(const char *name, int type, + Tcl_Channel chan); +static Tcl_Encoding LoadEscapeEncoding(const char *name, Tcl_Channel chan); +static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, + const char *name); +static void TableFreeProc(ClientData clientData); +static int TableFromUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int TableToUtfProc(ClientData clientData, const char *src, + int srcLen, int flags, Tcl_EncodingState *statePtr, + char *dst, int dstLen, int *srcReadPtr, + int *dstWrotePtr, int *dstCharsPtr); static size_t unilen(const char *src); -static size_t unilen4(const char *src); -static Tcl_EncodingConvertProc Utf32ToUtfProc; -static Tcl_EncodingConvertProc UtfToUtf32Proc; -static Tcl_EncodingConvertProc Utf16ToUtfProc; -static Tcl_EncodingConvertProc UtfToUtf16Proc; -static Tcl_EncodingConvertProc UtfToUcs2Proc; -static Tcl_EncodingConvertProc UtfToUtfProc; -static Tcl_EncodingConvertProc Iso88591FromUtfProc; -static Tcl_EncodingConvertProc Iso88591ToUtfProc; - +static int UnicodeToUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUnicodeProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr, int pureNullMode); +static int UtfIntToUtfExtProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); 
+static int UtfExtToUtfIntProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int Iso88591FromUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int Iso88591ToUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, + int dstLen, int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field - * of the internalrep. This should help the lifetime of encodings be more useful. + * of the intrep. This should help the lifetime of encodings be more useful. * See concerns raised in [Bug 1077262]. */ -static const Tcl_ObjType encodingType = { - "encoding", - FreeEncodingInternalRep, - DupEncodingInternalRep, - NULL, - NULL +static Tcl_ObjType encodingType = { + "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL }; - -#define EncodingSetInternalRep(objPtr, encoding) \ - do { \ - Tcl_ObjInternalRep ir; \ - ir.twoPtrValue.ptr1 = (encoding); \ - ir.twoPtrValue.ptr2 = NULL; \ - Tcl_StoreInternalRep((objPtr), &encodingType, &ir); \ - } while (0) - -#define EncodingGetInternalRep(objPtr, encoding) \ - do { \ - const Tcl_ObjInternalRep *irPtr; \ - irPtr = TclFetchInternalRep ((objPtr), &encodingType); \ - (encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL; \ - } while (0) - /* *---------------------------------------------------------------------- @@ -303,7 +294,7 @@ static const Tcl_ObjType encodingType = { * Standard Tcl return code. * * Side effects: - * Caches the Tcl_Encoding value as the internal rep of (*objPtr). + * Caches the Tcl_Encoding value as the internal rep of (*objPtr). 
* *---------------------------------------------------------------------- */ @@ -314,16 +305,16 @@ Tcl_GetEncodingFromObj( Tcl_Obj *objPtr, Tcl_Encoding *encodingPtr) { - Tcl_Encoding encoding; - const char *name = TclGetString(objPtr); + const char *name = Tcl_GetString(objPtr); + if (objPtr->typePtr != &encodingType) { + Tcl_Encoding encoding = Tcl_GetEncoding(interp, name); - EncodingGetInternalRep(objPtr, encoding); - if (encoding == NULL) { - encoding = Tcl_GetEncoding(interp, name); if (encoding == NULL) { return TCL_ERROR; } - EncodingSetInternalRep(objPtr, encoding); + TclFreeIntRep(objPtr); + objPtr->internalRep.twoPtrValue.ptr1 = (VOID *) encoding; + objPtr->typePtr = &encodingType; } *encodingPtr = Tcl_GetEncoding(NULL, name); return TCL_OK; @@ -332,7 +323,7 @@ Tcl_GetEncodingFromObj( /* *---------------------------------------------------------------------- * - * FreeEncodingInternalRep -- + * FreeEncodingIntRep -- * * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType. * @@ -340,19 +331,17 @@ Tcl_GetEncodingFromObj( */ static void -FreeEncodingInternalRep( +FreeEncodingIntRep( Tcl_Obj *objPtr) { - Tcl_Encoding encoding; - - EncodingGetInternalRep(objPtr, encoding); - Tcl_FreeEncoding(encoding); + Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.twoPtrValue.ptr1); + objPtr->typePtr = NULL; } /* *---------------------------------------------------------------------- * - * DupEncodingInternalRep -- + * DupEncodingIntRep -- * * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType. 
* @@ -360,12 +349,12 @@ FreeEncodingInternalRep( */ static void -DupEncodingInternalRep( +DupEncodingIntRep( Tcl_Obj *srcPtr, Tcl_Obj *dupPtr) { - Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr)); - EncodingSetInternalRep(dupPtr, encoding); + dupPtr->internalRep.twoPtrValue.ptr1 = (VOID *) + Tcl_GetEncoding(NULL, srcPtr->bytes); } /* @@ -403,9 +392,9 @@ int Tcl_SetEncodingSearchPath( Tcl_Obj *searchPath) { - Tcl_Size dummy; + int dummy; - if (TCL_ERROR == TclListObjLength(NULL, searchPath, &dummy)) { + if (TCL_ERROR == Tcl_ListObjLength(NULL, searchPath, &dummy)) { return TCL_ERROR; } TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL); @@ -440,8 +429,9 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * Since the result of this routine is void, if searchPath is not a valid - * list this routine silently does nothing. + * NOTE: this routine returns void, so there's no way to report the error + * that searchPath is not a valid list. In that case, this routine will + * silently do nothing. * *---------------------------------------------------------------------- */ @@ -450,9 +440,9 @@ void TclSetLibraryPath( Tcl_Obj *path) { - Tcl_Size dummy; + int dummy; - if (TCL_ERROR == TclListObjLength(NULL, path, &dummy)) { + if (TCL_ERROR == Tcl_ListObjLength(NULL, path, &dummy)) { return; } TclSetProcessGlobalValue(&libraryPath, path, NULL); @@ -463,16 +453,17 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to update the encoding file map with the current value - * of the encoding search path. + * Called to bring the encoding file map in sync with the current value + * of the encoding search path. * - * Finds *.end files in the directories on the encoding search path and - * stores the found pathnames in a map associated with the encoding name. 
+ * Scan the directories on the encoding search path, find the *.enc + * files, and store the found pathnames in a map associated with the + * encoding name. * - * If $dir is on the encoding search path and the file $dir/foo.enc is - * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding - * is needed later, the $dir/foo.enc name can be quickly constructed in - * order to read the encoding data. + * In particular, if $dir is on the encoding search path, and the file + * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. + * Later, any need for the "foo" encoding will quickly * be able to + * construct the $dir/foo.enc pathname for reading the encoding data. * * Results: * None. @@ -486,43 +477,42 @@ TclSetLibraryPath( static void FillEncodingFileMap(void) { - Tcl_Size i, numDirs = 0; + int i, numDirs = 0; Tcl_Obj *map, *searchPath; searchPath = Tcl_GetEncodingSearchPath(); Tcl_IncrRefCount(searchPath); - TclListObjLength(NULL, searchPath, &numDirs); + Tcl_ListObjLength(NULL, searchPath, &numDirs); map = Tcl_NewDictObj(); Tcl_IncrRefCount(map); - for (i = numDirs-1; i != TCL_INDEX_NONE; i--) { + for (i = numDirs-1; i >= 0; i--) { /* * Iterate backwards through the search path so as we overwrite * entries found, we favor files earlier on the search path. 
*/ - Tcl_Size j, numFiles; - Tcl_Obj *directory, *matchFileList; + int j, numFiles; + Tcl_Obj *directory, *matchFileList = Tcl_NewObj(); Tcl_Obj **filev; Tcl_GlobTypeData readableFiles = { TCL_GLOB_TYPE_FILE, TCL_GLOB_PERM_R, NULL, NULL }; - TclNewObj(matchFileList); Tcl_ListObjIndex(NULL, searchPath, i, &directory); Tcl_IncrRefCount(directory); Tcl_IncrRefCount(matchFileList); Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc", &readableFiles); - TclListObjGetElements(NULL, matchFileList, &numFiles, &filev); + Tcl_ListObjGetElements(NULL, matchFileList, &numFiles, &filev); for (j=0; j<numFiles; j++) { - Tcl_Obj *encodingName, *fileObj; + Tcl_Obj *encodingName, *file; - fileObj = TclPathPart(NULL, filev[j], TCL_PATH_TAIL); - encodingName = TclPathPart(NULL, fileObj, TCL_PATH_ROOT); + file = TclPathPart(NULL, filev[j], TCL_PATH_TAIL); + encodingName = TclPathPart(NULL, file, TCL_PATH_ROOT); Tcl_DictObjPut(NULL, map, encodingName, directory); - Tcl_DecrRefCount(fileObj); + Tcl_DecrRefCount(file); Tcl_DecrRefCount(encodingName); } Tcl_DecrRefCount(matchFileList); @@ -550,113 +540,51 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* - * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS - * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this - * when adding bits. TODO - should really be defined in a single file. - * - * To prevent conflicting bits, only define bits within 0xff00 mask here. 
- */ -#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */ -#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ -#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ - void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; - TableEncodingData *dataPtr; - unsigned size; - unsigned short i; - union { - char c; - short s; - } isLe; - int leFlags; if (encodingsInitialized) { return; } - /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */ - isLe.s = 1; - leFlags = isLe.c ? TCL_ENCODING_LE : 0; - Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. UTF-8 to UTF-8 translation is not a - * no-op because it turns a stream of improperly formed UTF-8 into a - * properly formed stream. + * Create a few initial encodings. Note that the UTF-8 to UTF-8 + * translation is not a no-op, because it will turn a stream of improperly + * formed UTF-8 into a properly formed stream. 
*/ - type.encodingName = NULL; + type.encodingName = "identity"; type.toUtfProc = BinaryProc; type.fromUtfProc = BinaryProc; type.freeProc = NULL; type.nullSize = 1; type.clientData = NULL; - tclIdentityEncoding = Tcl_CreateEncoding(&type); + + defaultEncoding = Tcl_CreateEncoding(&type); + tclIdentityEncoding = Tcl_GetEncoding(NULL, type.encodingName); + systemEncoding = Tcl_GetEncoding(NULL, type.encodingName); type.encodingName = "utf-8"; - type.toUtfProc = UtfToUtfProc; - type.fromUtfProc = UtfToUtfProc; + type.toUtfProc = UtfExtToUtfIntProc; + type.fromUtfProc = UtfIntToUtfExtProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = INT2PTR(ENCODING_UTF); - tclUtf8Encoding = Tcl_CreateEncoding(&type); type.clientData = NULL; - type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); - type.toUtfProc = Utf16ToUtfProc; - type.fromUtfProc = UtfToUcs2Proc; - type.freeProc = NULL; - type.nullSize = 2; - type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); - Tcl_CreateEncoding(&type); - type.encodingName = "ucs-2be"; - type.clientData = NULL; - Tcl_CreateEncoding(&type); - type.encodingName = "ucs-2"; - type.clientData = INT2PTR(leFlags); - Tcl_CreateEncoding(&type); - - type.toUtfProc = Utf32ToUtfProc; - type.fromUtfProc = UtfToUtf32Proc; - type.freeProc = NULL; - type.nullSize = 4; - type.encodingName = "utf-32le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); - Tcl_CreateEncoding(&type); - type.encodingName = "utf-32be"; - type.clientData = NULL; - Tcl_CreateEncoding(&type); - type.encodingName = "utf-32"; - type.clientData = INT2PTR(leFlags); - Tcl_CreateEncoding(&type); - - type.toUtfProc = Utf16ToUtfProc; - type.fromUtfProc = UtfToUtf16Proc; + type.encodingName = "unicode"; + type.toUtfProc = UnicodeToUtfProc; + type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.encodingName = "utf-16le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); - Tcl_CreateEncoding(&type); - type.encodingName = 
"utf-16be"; type.clientData = NULL; Tcl_CreateEncoding(&type); - type.encodingName = "utf-16"; - type.clientData = INT2PTR(leFlags); - Tcl_CreateEncoding(&type); - -#ifndef TCL_NO_DEPRECATED - type.encodingName = "unicode"; - Tcl_CreateEncoding(&type); -#endif /* * Need the iso8859-1 encoding in order to process binary data, so force @@ -665,36 +593,42 @@ TclInitEncodingSubsystem(void) * code to duplicate the structure of a table encoding here. */ - dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData)); - memset(dataPtr, 0, sizeof(TableEncodingData)); - dataPtr->fallback = '?'; + { + TableEncodingData *dataPtr = (TableEncodingData *) + ckalloc(sizeof(TableEncodingData)); + unsigned size; + unsigned short i; - size = 256*(sizeof(unsigned short *) + sizeof(unsigned short)); - dataPtr->toUnicode = (unsigned short **)ckalloc(size); - memset(dataPtr->toUnicode, 0, size); - dataPtr->fromUnicode = (unsigned short **)ckalloc(size); - memset(dataPtr->fromUnicode, 0, size); + memset(dataPtr, 0, sizeof(TableEncodingData)); + dataPtr->fallback = '?'; - dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256); - dataPtr->fromUnicode[0] = (unsigned short *) (dataPtr->fromUnicode + 256); - for (i=1 ; i<256 ; i++) { - dataPtr->toUnicode[i] = emptyPage; - dataPtr->fromUnicode[i] = emptyPage; - } + size = 256*(sizeof(unsigned short *) + sizeof(unsigned short)); + dataPtr->toUnicode = (unsigned short **) ckalloc(size); + memset(dataPtr->toUnicode, 0, size); + dataPtr->fromUnicode = (unsigned short **) ckalloc(size); + memset(dataPtr->fromUnicode, 0, size); - for (i=0 ; i<256 ; i++) { - dataPtr->toUnicode[0][i] = i; - dataPtr->fromUnicode[0][i] = i; - } + dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256); + dataPtr->fromUnicode[0] = (unsigned short *) + (dataPtr->fromUnicode + 256); + for (i=1 ; i<256 ; i++) { + dataPtr->toUnicode[i] = emptyPage; + dataPtr->fromUnicode[i] = emptyPage; + } - type.encodingName = "iso8859-1"; - type.toUtfProc 
= Iso88591ToUtfProc; - type.fromUtfProc = Iso88591FromUtfProc; - type.freeProc = TableFreeProc; - type.nullSize = 1; - type.clientData = dataPtr; - defaultEncoding = Tcl_CreateEncoding(&type); - systemEncoding = Tcl_GetEncoding(NULL, type.encodingName); + for (i=0 ; i<256 ; i++) { + dataPtr->toUnicode[0][i] = i; + dataPtr->fromUnicode[0][i] = i; + } + + type.encodingName = "iso8859-1"; + type.toUtfProc = Iso88591ToUtfProc; + type.fromUtfProc = Iso88591FromUtfProc; + type.freeProc = TableFreeProc; + type.nullSize = 1; + type.clientData = dataPtr; + Tcl_CreateEncoding(&type); + } encodingsInitialized = 1; } @@ -724,12 +658,7 @@ TclFinalizeEncodingSubsystem(void) Tcl_MutexLock(&encodingMutex); encodingsInitialized = 0; FreeEncoding(systemEncoding); - systemEncoding = NULL; - defaultEncoding = NULL; FreeEncoding(tclIdentityEncoding); - tclIdentityEncoding = NULL; - FreeEncoding(tclUtf8Encoding); - tclUtf8Encoding = NULL; hPtr = Tcl_FirstHashEntry(&encodingTable, &search); while (hPtr != NULL) { @@ -740,7 +669,7 @@ TclFinalizeEncodingSubsystem(void) * cleaned up. */ - FreeEncoding((Tcl_Encoding)Tcl_GetHashValue(hPtr)); + FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr)); hPtr = Tcl_FirstHashEntry(&encodingTable, &search); } @@ -753,33 +682,32 @@ TclFinalizeEncodingSubsystem(void) * * Tcl_GetDefaultEncodingDir -- * - * Legacy public interface to retrieve first directory in the encoding - * searchPath. + * Legacy public interface to retrieve first directory in the encoding + * searchPath. * * Results: * The directory pathname, as a string, or NULL for an empty encoding * search path. * * Side effects: - * None. + * None. 
* *------------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 const char * Tcl_GetDefaultEncodingDir(void) { int numDirs; Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath(); - TclListObjLength(NULL, searchPath, &numDirs); + Tcl_ListObjLength(NULL, searchPath, &numDirs); if (numDirs == 0) { return NULL; } Tcl_ListObjIndex(NULL, searchPath, 0, &first); - return TclGetString(first); + return Tcl_GetString(first); } /* @@ -787,14 +715,14 @@ Tcl_GetDefaultEncodingDir(void) * * Tcl_SetDefaultEncodingDir -- * - * Legacy public interface to set the first directory in the encoding - * search path. + * Legacy public interface to set the first directory in the encoding + * search path. * * Results: - * None. + * None. * * Side effects: - * Modifies the encoding search path. + * Modifies the encoding search path. * *------------------------------------------------------------------------- */ @@ -804,13 +732,12 @@ Tcl_SetDefaultEncodingDir( const char *path) { Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath(); - Tcl_Obj *directory = Tcl_NewStringObj(path, TCL_INDEX_NONE); + Tcl_Obj *directory = Tcl_NewStringObj(path, -1); searchPath = Tcl_DuplicateObj(searchPath); Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory); Tcl_SetEncodingSearchPath(searchPath); } -#endif /* *------------------------------------------------------------------------- @@ -828,7 +755,11 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * LoadEncodingFile is called if necessary. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. 
* *------------------------------------------------------------------------- */ @@ -851,7 +782,7 @@ Tcl_GetEncoding( hPtr = Tcl_FindHashEntry(&encodingTable, name); if (hPtr != NULL) { - encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); + encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); encodingPtr->refCount++; Tcl_MutexUnlock(&encodingMutex); return (Tcl_Encoding) encodingPtr; @@ -866,15 +797,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * Releases an encoding allocated by Tcl_CreateEncoding() or - * Tcl_GetEncoding(). + * This function is called to release an encoding allocated by + * Tcl_CreateEncoding() or Tcl_GetEncoding(). * * Results: * None. * * Side effects: * The reference count associated with the encoding is decremented and - * the encoding is deleted if nothing is using it anymore. + * the encoding may be deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -893,14 +824,13 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * Decrements the reference count of an encoding. The caller must hold - * encodingMutes. + * This function is called to release an encoding by functions that + * already have the encodingMutex. * * Results: * None. * * Side effects: - * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. 
* @@ -911,22 +841,25 @@ static void FreeEncoding( Tcl_Encoding encoding) { - Encoding *encodingPtr = (Encoding *) encoding; + Encoding *encodingPtr; + encodingPtr = (Encoding *) encoding; if (encodingPtr == NULL) { return; } - if (encodingPtr->refCount-- <= 1) { + if (encodingPtr->refCount<=0) { + Tcl_Panic("FreeEncoding: refcount problem !!!"); + } + encodingPtr->refCount--; + if (encodingPtr->refCount == 0) { if (encodingPtr->freeProc != NULL) { - encodingPtr->freeProc(encodingPtr->clientData); + (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - if (encodingPtr->name) { - ckfree(encodingPtr->name); - } - ckfree(encodingPtr); + ckfree((char *) encodingPtr->name); + ckfree((char *) encodingPtr); } } @@ -935,7 +868,7 @@ FreeEncoding( * * Tcl_GetEncodingName -- * - * Given an encoding, return the name that was used to construct the + * Given an encoding, return the name that was used to constuct the * encoding. 
* * Results: @@ -983,11 +916,10 @@ Tcl_GetEncodingNames( Tcl_HashTable table; Tcl_HashSearch search; Tcl_HashEntry *hPtr; - Tcl_Obj *map, *name, *result; + Tcl_Obj *map, *name, *result = Tcl_NewObj(); Tcl_DictSearch mapSearch; int dummy, done = 0; - TclNewObj(result); Tcl_InitObjHashTable(&table); /* @@ -997,10 +929,9 @@ Tcl_GetEncodingNames( Tcl_MutexLock(&encodingMutex); for (hPtr = Tcl_FirstHashEntry(&encodingTable, &search); hPtr != NULL; hPtr = Tcl_NextHashEntry(&search)) { - Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); - + Encoding *encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); Tcl_CreateHashEntry(&table, - Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy); + (char *) Tcl_NewStringObj(encodingPtr->name, -1), &dummy); } Tcl_MutexUnlock(&encodingMutex); @@ -1013,7 +944,7 @@ Tcl_GetEncodingNames( Tcl_DictObjFirst(NULL, map, &mapSearch, &name, NULL, &done); for (; !done; Tcl_DictObjNext(&mapSearch, &name, NULL, &done)) { - Tcl_CreateHashEntry(&table, name, &dummy); + Tcl_CreateHashEntry(&table, (char *) name, &dummy); } /* @@ -1030,33 +961,6 @@ Tcl_GetEncodingNames( } /* - *------------------------------------------------------------------------- - * - * Tcl_GetEncodingNulLength -- - * - * Given an encoding, return the number of nul bytes used for the - * string termination. - * - * Results: - * The number of nul bytes used for the string termination. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ -Tcl_Size -Tcl_GetEncodingNulLength( - Tcl_Encoding encoding) -{ - if (encoding == NULL) { - encoding = systemEncoding; - } - - return ((Encoding *) encoding)->nullSize; -} - -/* *------------------------------------------------------------------------ * * Tcl_SetSystemEncoding -- @@ -1075,7 +979,7 @@ Tcl_GetEncodingNulLength( * Side effects: * The reference count of the new system encoding is incremented. 
The * reference count of the old system encoding is decremented and it may - * be freed. All VFS cached information is invalidated. + * be freed. * *------------------------------------------------------------------------ */ @@ -1106,7 +1010,6 @@ Tcl_SetSystemEncoding( FreeEncoding(systemEncoding); systemEncoding = encoding; Tcl_MutexUnlock(&encodingMutex); - Tcl_FSMountsChanged(NULL); return TCL_OK; } @@ -1116,22 +1019,23 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * Defines a new encoding, along with the functions that are used to - * convert to and from Unicode. + * This function is called to define a new encoding and the functions + * that are used to convert between the specified encoding and Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and is eventually garbage collected - * when the last reference to it goes away. Any subsequent calls to - * Tcl_GetEncoding with the specified name retrieve the most recent - * encoding token. + * continues to behave as it used to, and will eventually be garbage + * collected when the last reference to it goes away. Any subsequent + * calls to Tcl_GetEncoding with the specified name will retrieve the + * most recent encoding token. * * Side effects: - * A new record having the name of the encoding is entered into a table of - * encodings visible to all interpreters. For each call to this function, - * there should eventually be a call to Tcl_FreeEncoding, which cleans - * deletes the record in the table when an encoding is no longer needed. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. 
* *--------------------------------------------------------------------------- */ @@ -1141,26 +1045,9 @@ Tcl_CreateEncoding( const Tcl_EncodingType *typePtr) /* The encoding type. */ { - Encoding *encodingPtr = (Encoding *)ckalloc(sizeof(Encoding)); - encodingPtr->name = NULL; - encodingPtr->toUtfProc = typePtr->toUtfProc; - encodingPtr->fromUtfProc = typePtr->fromUtfProc; - encodingPtr->freeProc = typePtr->freeProc; - encodingPtr->nullSize = typePtr->nullSize; - encodingPtr->clientData = typePtr->clientData; - if (typePtr->nullSize == 2) { - encodingPtr->lengthProc = (LengthProc *) unilen; - } else if (typePtr->nullSize == 4) { - encodingPtr->lengthProc = (LengthProc *) unilen4; - } else { - encodingPtr->lengthProc = (LengthProc *) strlen; - } - encodingPtr->refCount = 1; - encodingPtr->hPtr = NULL; - - if (typePtr->encodingName) { Tcl_HashEntry *hPtr; int isNew; + Encoding *encodingPtr; char *name; Tcl_MutexLock(&encodingMutex); @@ -1171,17 +1058,30 @@ Tcl_CreateEncoding( * reference goes away. 
*/ - Encoding *replaceMe = (Encoding *)Tcl_GetHashValue(hPtr); - replaceMe->hPtr = NULL; + encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); + encodingPtr->hPtr = NULL; } - name = (char *)ckalloc(strlen(typePtr->encodingName) + 1); + name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1); + + encodingPtr = (Encoding *) ckalloc(sizeof(Encoding)); encodingPtr->name = strcpy(name, typePtr->encodingName); + encodingPtr->toUtfProc = typePtr->toUtfProc; + encodingPtr->fromUtfProc = typePtr->fromUtfProc; + encodingPtr->freeProc = typePtr->freeProc; + encodingPtr->nullSize = typePtr->nullSize; + encodingPtr->clientData = typePtr->clientData; + if (typePtr->nullSize == 1) { + encodingPtr->lengthProc = (LengthProc *) strlen; + } else { + encodingPtr->lengthProc = (LengthProc *) unilen; + } + encodingPtr->refCount = 1; encodingPtr->hPtr = hPtr; Tcl_SetHashValue(hPtr, encodingPtr); Tcl_MutexUnlock(&encodingMutex); - } + return (Tcl_Encoding) encodingPtr; } @@ -1211,151 +1111,46 @@ Tcl_ExternalToUtfDString( Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + int srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx( - NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); - return Tcl_DStringValue(dstPtr); -} - - -/* - *------------------------------------------------------------------------- - * - * Tcl_ExternalToUtfDStringEx -- - * - * Convert a source buffer from the specified encoding into UTF-8. - * "flags" controls the behavior if any of the bytes in - * the source buffer are invalid or cannot be represented in utf-8. - * Possible flags values: - * target encoding. 
It should be composed by OR-ing the following: - * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} - * - * Results: - * The return value is one of - * TCL_OK: success. Converted string in *dstPtr - * TCL_ERROR: error in passed parameters. Error message in interp - * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence - * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition - * TCL_CONVERT_UNKNOWN: source contained a character that could not - * be represented in target encoding. - * - * Side effects: - * - * TCL_OK: The converted bytes are stored in the DString and NUL - * terminated in an encoding-specific manner. - * TCL_ERROR: an error, message is stored in the interp if not NULL. - * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored - * in the interpreter (if not NULL). If errorLocPtr is not NULL, - * no error message is stored as it is expected the caller is - * interested in whatever is decoded so far and not treating this - * as an error condition. - * - * In addition, *dstPtr is always initialized and must be cleared - * by the caller irrespective of the return code. - * - *------------------------------------------------------------------------- - */ - -int -Tcl_ExternalToUtfDStringEx( - Tcl_Interp *interp, /* For error messages. May be NULL. */ - Tcl_Encoding encoding, /* The encoding for the source string, or NULL - * for the default system encoding. */ - const char *src, /* Source string in specified encoding. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for - * encoding-specific string length. */ - int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr, /* Uninitialized or free DString in which the - * converted string is stored. */ - Tcl_Size *errorLocPtr) /* Where to store the error location - (or TCL_INDEX_NONE if no error). May - be NULL. 
*/ -{ char *dst; Tcl_EncodingState state; - const Encoding *encodingPtr; - int result, srcRead, dstWrote, dstChars; - Tcl_Size dstLen, soFar; - const char *srcStart = src; + Encoding *encodingPtr; + int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; - /* DO FIRST - Must always be initialized before returning */ Tcl_DStringInit(dstPtr); - - if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { - /* TODO - what other flags are illegal? - See TIP 656 */ - Tcl_SetObjResult( - interp, - Tcl_NewStringObj( - "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", - TCL_INDEX_NONE)); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", (void *)NULL); - errno = EINVAL; - return TCL_ERROR; - } - dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; if (encoding == NULL) { encoding = systemEncoding; } - encodingPtr = (Encoding *)encoding; + encodingPtr = (Encoding *) encoding; if (src == NULL) { srcLen = 0; } else if (srcLen < 0) { - srcLen = encodingPtr->lengthProc(src); + srcLen = (*encodingPtr->lengthProc)(src); } - flags |= TCL_ENCODING_START | TCL_ENCODING_END; - if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= ENCODING_INPUT; - } + flags = TCL_ENCODING_START | TCL_ENCODING_END; while (1) { - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, - &srcRead, &dstWrote, &dstChars); + result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, + srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, + &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); - src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - Tcl_Size nBytesProcessed = (src - srcStart); - Tcl_DStringSetLength(dstPtr, soFar); - if (errorLocPtr) { - /* - * Do not write error message into interpreter if caller - * wants to know error location. - */ - *errorLocPtr = result == TCL_OK ? 
TCL_INDEX_NONE : nBytesProcessed; - } else { - /* Caller wants error message on failure */ - if (result != TCL_OK && interp != NULL) { - char buf[TCL_INTEGER_SPACE]; - snprintf(buf, sizeof(buf), "%" TCL_SIZE_MODIFIER "d", nBytesProcessed); - Tcl_SetObjResult( - interp, - Tcl_ObjPrintf("unexpected byte sequence starting at index %" - TCL_SIZE_MODIFIER "d: '\\x%02X'", - nBytesProcessed, - UCHAR(srcStart[nBytesProcessed]))); - Tcl_SetErrorCode( - interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, (void *)NULL); - } - } - if (result != TCL_OK) { - errno = (result == TCL_CONVERT_NOSPACE) ? ENOMEM : EILSEQ; - } - return result; + return Tcl_DStringValue(dstPtr); } - /* Expand space and continue */ flags &= ~TCL_ENCODING_START; + src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1386,11 +1181,11 @@ Tcl_ExternalToUtfDStringEx( int Tcl_ExternalToUtf( - TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */ + Tcl_Interp *interp, /* Interp for error return, if not NULL. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + int srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state @@ -1400,7 +1195,7 @@ Tcl_ExternalToUtf( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ - Tcl_Size dstLen, /* The maximum length of output buffer in + int dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. This may @@ -1414,11 +1209,8 @@ Tcl_ExternalToUtf( * correspond to the bytes stored in the * output buffer. 
*/ { - const Encoding *encodingPtr; - int result, srcRead, dstWrote, dstChars = 0; - int noTerminate = flags & TCL_ENCODING_NO_TERMINATE; - int charLimited = (flags & TCL_ENCODING_CHAR_LIMIT) && dstCharsPtr; - int maxChars = INT_MAX; + Encoding *encodingPtr; + int result, srcRead, dstWrote, dstChars; Tcl_EncodingState state; if (encoding == NULL) { @@ -1429,7 +1221,7 @@ Tcl_ExternalToUtf( if (src == NULL) { srcLen = 0; } else if (srcLen < 0) { - srcLen = encodingPtr->lengthProc(src); + srcLen = (*encodingPtr->lengthProc)(src); } if (statePtr == NULL) { flags |= TCL_ENCODING_START | TCL_ENCODING_END; @@ -1443,48 +1235,19 @@ Tcl_ExternalToUtf( } if (dstCharsPtr == NULL) { dstCharsPtr = &dstChars; - flags &= ~TCL_ENCODING_CHAR_LIMIT; - } else if (charLimited) { - maxChars = *dstCharsPtr; } - if (!noTerminate) { - if (dstLen < 1) { - return TCL_CONVERT_NOSPACE; - } - /* - * If there are any null characters in the middle of the buffer, - * they will converted to the UTF-8 null character (\xC0\x80). To get - * the actual \0 at the end of the destination buffer, we need to - * append it manually. First make room for it... - */ - - dstLen--; - } else { - if (dstLen < 0) { - return TCL_CONVERT_NOSPACE; - } - } - if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= ENCODING_INPUT; - } - do { - Tcl_EncodingState savedState = *statePtr; - - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr); - if (*dstCharsPtr <= maxChars) { - break; - } - dstLen = TclUtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); - *statePtr = savedState; - } while (1); - if (!noTerminate) { - /* ...and then append it */ + /* + * If there are any null characters in the middle of the buffer, they will + * converted to the UTF-8 null character (\xC080). To get the actual \0 at + * the end of the destination buffer, we need to append it manually. 
+ */ - dst[*dstWrotePtr] = '\0'; - } + dstLen--; + result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen, + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + dstCharsPtr); + dst[*dstWrotePtr] = '\0'; return result; } @@ -1494,9 +1257,10 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 to the specified encoding. If any + * Convert a source buffer from UTF-8 into the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character is substituted. + * in the target encoding, a default fallback character will be + * substituted. * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1514,91 +1278,17 @@ Tcl_UtfToExternalDString( Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + int srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_UtfToExternalDStringEx( - NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); - return Tcl_DStringValue(dstPtr); -} - - -/* - *------------------------------------------------------------------------- - * - * Tcl_UtfToExternalDStringEx -- - * - * Convert a source buffer from UTF-8 to the specified encoding. - * The parameter flags controls the behavior, if any of the bytes in - * the source buffer are invalid or cannot be represented in the - * target encoding. It should be composed by OR-ing the following: - * - *At most one* of TCL_ENCODING_PROFILE_* - * - * Results: - * The return value is one of - * TCL_OK: success. Converted string in *dstPtr - * TCL_ERROR: error in passed parameters. 
Error message in interp - * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence - * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition - * TCL_CONVERT_UNKNOWN: source contained a character that could not - * be represented in target encoding. - * - * Side effects: - * - * TCL_OK: The converted bytes are stored in the DString and NUL - * terminated in an encoding-specific manner - * TCL_ERROR: an error, message is stored in the interp if not NULL. - * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored - * in the interpreter (if not NULL). If errorLocPtr is not NULL, - * no error message is stored as it is expected the caller is - * interested in whatever is decoded so far and not treating this - * as an error condition. - * - * In addition, *dstPtr is always initialized and must be cleared - * by the caller irrespective of the return code. - * - *------------------------------------------------------------------------- - */ - -int -Tcl_UtfToExternalDStringEx( - Tcl_Interp *interp, /* For error messages. May be NULL. */ - Tcl_Encoding encoding, /* The encoding for the converted string, or - * NULL for the default system encoding. */ - const char *src, /* Source string in UTF-8. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for - * strlen(). */ - int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr, /* Uninitialized or free DString in which the - * converted string is stored. */ - Tcl_Size *errorLocPtr) /* Where to store the error location - (or TCL_INDEX_NONE if no error). May - be NULL. 
*/ -{ char *dst; Tcl_EncodingState state; - const Encoding *encodingPtr; - int result, srcRead, dstWrote, dstChars; - const char *srcStart = src; - Tcl_Size dstLen, soFar; + Encoding *encodingPtr; + int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; - /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); - - if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { - /* TODO - what other flags are illegal? - See TIP 656 */ - Tcl_SetObjResult( - interp, - Tcl_NewStringObj( - "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", - TCL_INDEX_NONE)); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", (void *)NULL); - errno = EINVAL; - return TCL_ERROR; - } - dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1612,53 +1302,23 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen < 0) { srcLen = strlen(src); } - - flags |= TCL_ENCODING_START | TCL_ENCODING_END; + flags = TCL_ENCODING_START | TCL_ENCODING_END; while (1) { - result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, - &srcRead, &dstWrote, &dstChars); + result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, + srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, + &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); - src += srcRead; if (result != TCL_CONVERT_NOSPACE) { - Tcl_Size nBytesProcessed = (src - srcStart); - int i = soFar + encodingPtr->nullSize - 1; - while (i >= soFar) { - Tcl_DStringSetLength(dstPtr, i--); + if (encodingPtr->nullSize == 2) { + Tcl_DStringSetLength(dstPtr, soFar + 1); } - if (errorLocPtr) { - /* - * Do not write error message into interpreter if caller - * wants to know error location. - */ - *errorLocPtr = result == TCL_OK ? 
TCL_INDEX_NONE : nBytesProcessed; - } else { - /* Caller wants error message on failure */ - if (result != TCL_OK && interp != NULL) { - Tcl_Size pos = TclNumUtfChars(srcStart, nBytesProcessed); - int ucs4; - char buf[TCL_INTEGER_SPACE]; - TclUtfToUniChar(&srcStart[nBytesProcessed], &ucs4); - snprintf(buf, sizeof(buf), "%" TCL_SIZE_MODIFIER "d", nBytesProcessed); - Tcl_SetObjResult( - interp, - Tcl_ObjPrintf( - "unexpected character at index %" TCL_SIZE_MODIFIER - "u: 'U+%06X'", - pos, - ucs4)); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", - buf, (void *)NULL); - } - } - if (result != TCL_OK) { - errno = (result == TCL_CONVERT_NOSPACE) ? ENOMEM : EILSEQ; - } - return result; + Tcl_DStringSetLength(dstPtr, soFar); + return Tcl_DStringValue(dstPtr); } flags &= ~TCL_ENCODING_START; + src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1689,11 +1349,11 @@ Tcl_UtfToExternalDStringEx( int Tcl_UtfToExternal( - TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */ + Tcl_Interp *interp, /* Interp for error return, if not NULL. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ - Tcl_Size srcLen, /* Source string length in bytes, or < 0 for + int srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state @@ -1703,7 +1363,7 @@ Tcl_UtfToExternal( * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string * is stored. */ - Tcl_Size dstLen, /* The maximum length of output buffer in + int dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the * source string that were converted. 
This may @@ -1717,7 +1377,7 @@ Tcl_UtfToExternal( * correspond to the bytes stored in the * output buffer. */ { - const Encoding *encodingPtr; + Encoding *encodingPtr; int result, srcRead, dstWrote, dstChars; Tcl_EncodingState state; @@ -1745,18 +1405,14 @@ Tcl_UtfToExternal( dstCharsPtr = &dstChars; } - if (dstLen < encodingPtr->nullSize) { - return TCL_CONVERT_NOSPACE; - } dstLen -= encodingPtr->nullSize; - result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, - dstWrotePtr, dstCharsPtr); - /* - * Buffer is terminated irrespective of result. Not sure this is - * reasonable but keep for historical/compatibility reasons. - */ - memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize); + result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen, + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + dstCharsPtr); + if (encodingPtr->nullSize == 2) { + dst[*dstWrotePtr + 1] = '\0'; + } + dst[*dstWrotePtr] = '\0'; return result; } @@ -1774,20 +1430,19 @@ Tcl_UtfToExternal( * * Side effects: * The absolute pathname for the application is computed and stored to be - * returned later by [info nameofexecutable]. + * returned later be [info nameofexecutable]. * *--------------------------------------------------------------------------- */ -#undef Tcl_FindExecutable -const char * + +void Tcl_FindExecutable( const char *argv0) /* The value of the application's argv[0] * (native). */ { - const char *version = Tcl_InitSubsystems(); + TclInitSubsystems(); TclpSetInitialEncodings(); TclpFindExecutable(argv0); - return version; } /* @@ -1798,9 +1453,9 @@ Tcl_FindExecutable( * Open the file believed to hold data for the encoding, "name". * * Results: - * Returns the readable Tcl_Channel from opening the file, or NULL if the - * file could not be successfully opened. If NULL was returned, an error - * message is left in interp's result object, unless interp was NULL. 
+ * Returns the readable Tcl_Channel from opening the file, or NULL if the + * file could not be successfully opened. If NULL was returned, an error + * message is left in interp's result object, unless interp was NULL. * * Side effects: * Channel may be opened. Information about the filesystem may be cached @@ -1815,17 +1470,17 @@ OpenEncodingFileChannel( const char *name) /* The name of the encoding file on disk and * also the name for new encoding. */ { - Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE); + Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1); Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj); Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath()); Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap); Tcl_Obj **dir, *path, *directory = NULL; Tcl_Channel chan = NULL; - Tcl_Size i, numDirs; + int i, numDirs; - TclListObjGetElements(NULL, searchPath, &numDirs, &dir); + Tcl_ListObjGetElements(NULL, searchPath, &numDirs, &dir); Tcl_IncrRefCount(nameObj); - Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE); + Tcl_AppendToObj(fileNameObj, ".enc", -1); Tcl_IncrRefCount(fileNameObj); Tcl_DictObjGet(NULL, map, nameObj, &directory); @@ -1842,10 +1497,9 @@ OpenEncodingFileChannel( } } if (!verified) { - const char *dirString = TclGetString(directory); - + const char *dirString = Tcl_GetString(directory); for (i=0; i<numDirs && !verified; i++) { - if (strcmp(dirString, TclGetString(dir[i])) == 0) { + if (strcmp(dirString, Tcl_GetString(dir[i])) == 0) { verified = 1; } } @@ -1896,9 +1550,8 @@ OpenEncodingFileChannel( } if ((NULL == chan) && (interp != NULL)) { - Tcl_SetObjResult(interp, Tcl_ObjPrintf( - "unknown encoding \"%s\"", name)); - Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, (void *)NULL); + Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL); + Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL); } Tcl_DecrRefCount(fileNameObj); Tcl_DecrRefCount(nameObj); @@ -1916,13 +1569,13 @@ 
OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Tcl_Encoding or NULL if the file - * didn't exist or could not be processed. If NULL is returned and interp - * is not NULL, an error message is left in interp's result object. + * The return value is the newly loaded Encoding, or NULL if the file + * didn't exist of was in the incorrect format. If NULL was returned, an + * error message is left in interp's result object, unless interp was + * NULL. * * Side effects: - * A corresponding encoding file might be read from persistent storage, in - * which case LoadTableEncoding is called. + * File read from disk. * *--------------------------------------------------------------------------- */ @@ -1930,8 +1583,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of both the encoding file - * and the new encoding. */ + const char *name) /* The name of the encoding file on disk and + * also the name for new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1971,9 +1624,7 @@ LoadEncodingFile( break; } if ((encoding == NULL) && (interp != NULL)) { - Tcl_SetObjResult(interp, Tcl_ObjPrintf( - "invalid encoding file \"%s\"", name)); - Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, (void *)NULL); + Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL); } Tcl_Close(NULL, chan); @@ -1985,38 +1636,38 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType - * structure along with its corresponding TableEncodingData structure, and - * passes it to Tcl_Createncoding. + * Helper function for LoadEncodingTable(). Loads a table to that + * converts between Unicode and some other encoding and creates an + * encoding (using a TableEncoding structure) from that information. 
* - * The file contains binary data but begins with a marker to indicate - * byte-ordering so a single binary file can be read on big or - * little-endian systems. + * File contains binary data, but begins with a marker to indicate + * byte-ordering, so that same binary file can be read on either endian + * platforms. * * Results: - * Returns the new Tcl_Encoding, or NULL if it could could - * not be created because the file contained invalid data. + * The return value is the new encoding, or NULL if the encoding could + * not be created (because the file contained invalid data). * * Side effects: - * See Tcl_CreateEncoding(). + * None. * *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { Tcl_DString lineString; Tcl_Obj *objPtr; char *line; - int i, hi, lo, numPages, symbol, fallback, len; + int i, hi, lo, numPages, symbol, fallback; unsigned char used[256]; - unsigned size; + unsigned int size; TableEncodingData *dataPtr; - unsigned short *pageMemPtr, *page; + unsigned short *pageMemPtr; Tcl_EncodingType encType; /* @@ -2044,9 +1695,7 @@ LoadTableEncoding( }; Tcl_DStringInit(&lineString); - if (Tcl_Gets(chan, &lineString) < 0) { - return NULL; - } + Tcl_Gets(chan, &lineString); line = Tcl_DStringValue(&lineString); fallback = (int) strtol(line, &line, 16); @@ -2065,7 +1714,7 @@ LoadTableEncoding( #undef PAGESIZE #define PAGESIZE (256 * sizeof(unsigned short)) - dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData)); + dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData)); memset(dataPtr, 0, sizeof(TableEncodingData)); dataPtr->fallback = fallback; @@ -2077,7 +1726,7 @@ LoadTableEncoding( */ size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; - dataPtr->toUnicode 
= (unsigned short **)ckalloc(size); + dataPtr->toUnicode = (unsigned short **) ckalloc(size); memset(dataPtr->toUnicode, 0, size); pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256); @@ -2085,18 +1734,15 @@ LoadTableEncoding( Tcl_IncrRefCount(objPtr); for (i = 0; i < numPages; i++) { int ch; - const char *p; - Tcl_Size expected = 3 + 16 * (16 * 4 + 1); + char *p; - if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) { - return NULL; - } - p = TclGetString(objPtr); + Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0); + p = Tcl_GetString(objPtr); hi = (staticHex[UCHAR(p[0])] << 4) + staticHex[UCHAR(p[1])]; dataPtr->toUnicode[hi] = pageMemPtr; p += 2; for (lo = 0; lo < 256; lo++) { - if ((lo & 0x0F) == 0) { + if ((lo & 0x0f) == 0) { p++; } ch = (staticHex[UCHAR(p[0])] << 12) + (staticHex[UCHAR(p[1])] << 8) @@ -2122,10 +1768,10 @@ LoadTableEncoding( } /* - * Invert the toUnicode array to produce the fromUnicode array. Performs a + * Invert toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array remember what - * pages are needed for the fromUnicode array. + * by the array. While reading in the toUnicode array, we remembered what + * pages that would be needed for the fromUnicode array. 
*/ if (symbol) { @@ -2138,52 +1784,57 @@ LoadTableEncoding( } } size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; - dataPtr->fromUnicode = (unsigned short **)ckalloc(size); + dataPtr->fromUnicode = (unsigned short **) ckalloc(size); memset(dataPtr->fromUnicode, 0, size); pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256); for (hi = 0; hi < 256; hi++) { if (dataPtr->toUnicode[hi] == NULL) { dataPtr->toUnicode[hi] = emptyPage; - continue; - } - for (lo = 0; lo < 256; lo++) { - int ch = dataPtr->toUnicode[hi][lo]; - - if (ch != 0) { - page = dataPtr->fromUnicode[ch >> 8]; - if (page == NULL) { - page = pageMemPtr; - pageMemPtr += 256; - dataPtr->fromUnicode[ch >> 8] = page; + } else { + for (lo = 0; lo < 256; lo++) { + int ch; + + ch = dataPtr->toUnicode[hi][lo]; + if (ch != 0) { + unsigned short *page; + + page = dataPtr->fromUnicode[ch >> 8]; + if (page == NULL) { + page = pageMemPtr; + pageMemPtr += 256; + dataPtr->fromUnicode[ch >> 8] = page; + } + page[ch & 0xff] = (unsigned short) ((hi << 8) + lo); } - page[ch & 0xFF] = (unsigned short) ((hi << 8) + lo); } } } if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names don't work because - * the backslash in the file name maps to the unknown character + * one. Otherwise, on Windows, native file names won't work because + * the backslash in the file name will map to the unknown character * (question mark) when converting from UTF-8 to external encoding. 
*/ if (dataPtr->fromUnicode[0] != NULL) { - if (dataPtr->fromUnicode[0][(int)'\\'] == '\0') { - dataPtr->fromUnicode[0][(int)'\\'] = '\\'; + if (dataPtr->fromUnicode[0]['\\'] == '\0') { + dataPtr->fromUnicode[0]['\\'] = '\\'; } } } if (symbol) { + unsigned short *page; + /* - * Make a special symbol encoding that maps each symbol character from - * its Unicode code point down into page 0, and also ensure that each - * characters on page 0 maps to itself so that a symbol font can be - * used to display a simple string like "abcd" and have alpha, beta, - * chi, delta show up, rather than have "unknown" chars show up because - * strictly speaking the symbol font doesn't have glyphs for those low - * ASCII chars. + * Make a special symbol encoding that not only maps the symbol + * characters from their Unicode code points down into page 0, but + * also ensure that the characters on page 0 map to themselves. This + * is so that a symbol font can be used to display a simple string + * like "abcd" and have alpha, beta, chi, delta show up, rather than + * have "unknown" chars show up because strictly speaking the symbol + * font doesn't have glyphs for those low ascii chars. */ page = dataPtr->fromUnicode[0]; @@ -2208,77 +1859,57 @@ LoadTableEncoding( */ Tcl_DStringInit(&lineString); - - /* - * Skip leading empty lines. - */ - - while ((len = Tcl_Gets(chan, &lineString)) == 0) { - /* empty body */ - } - if (len < 0) { - goto doneParse; - } - - /* - * Require that it starts with an 'R'. - */ - - line = Tcl_DStringValue(&lineString); - if (line[0] != 'R') { - goto doneParse; - } - - /* - * Read lines until EOF. - */ - - for (TclDStringClear(&lineString); - (len = Tcl_Gets(chan, &lineString)) >= 0; - TclDStringClear(&lineString)) { - const unsigned char *p; - int to, from; + do { + int len; /* - * Skip short lines. + * Skip leading empty lines. 
*/ - if (len < 5) { - continue; + while ((len = Tcl_Gets(chan, &lineString)) == 0) { + /* empty body */ } - /* - * Parse the line as a sequence of hex digits. - */ - - p = (const unsigned char *) Tcl_DStringValue(&lineString); - to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) - + (staticHex[p[2]] << 4) + staticHex[p[3]]; - if (to == 0) { - continue; + if (len < 0) { + break; } - for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) { - from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) - + (staticHex[p[2]] << 4) + staticHex[p[3]]; - if (from == 0) { + line = Tcl_DStringValue(&lineString); + if (line[0] != 'R') { + break; + } + for (Tcl_DStringSetLength(&lineString, 0); + (len = Tcl_Gets(chan, &lineString)) >= 0; + Tcl_DStringSetLength(&lineString, 0)) { + unsigned char* p; + int to, from; + + if (len < 5) { continue; } - dataPtr->fromUnicode[from >> 8][from & 0xFF] = to; + p = (unsigned char*) Tcl_DStringValue(&lineString); + to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) + + (staticHex[p[2]] << 4) + staticHex[p[3]]; + if (to == 0) { + continue; + } + for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) { + from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) + + (staticHex[p[2]] << 4) + staticHex[p[3]]; + if (from == 0) { + continue; + } + dataPtr->fromUnicode[from >> 8][from & 0xff] = to; + } } - } - doneParse: + } while (0); Tcl_DStringFree(&lineString); - /* - * Package everything into an encoding structure. - */ - encType.encodingName = name; encType.toUtfProc = TableToUtfProc; encType.fromUtfProc = TableFromUtfProc; encType.freeProc = TableFreeProc; encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1; - encType.clientData = dataPtr; + encType.clientData = (ClientData) dataPtr; return Tcl_CreateEncoding(&encType); } @@ -2307,11 +1938,11 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. 
*/ Tcl_Channel chan) /* File containing new encoding. */ { int i; - unsigned size; + unsigned int size; Tcl_DString escapeData; char init[16], final[16]; EscapeEncodingData *dataPtr; @@ -2322,7 +1953,7 @@ LoadEscapeEncoding( Tcl_DStringInit(&escapeData); while (1) { - Tcl_Size argc; + int argc; const char **argv; char *line; Tcl_DString lineString; @@ -2333,7 +1964,6 @@ LoadEscapeEncoding( } line = Tcl_DStringValue(&lineString); if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) { - Tcl_DStringFree(&lineString); continue; } if (argc >= 2) { @@ -2361,8 +1991,8 @@ LoadEscapeEncoding( */ e = (Encoding *) Tcl_GetEncoding(NULL, est.name); - if ((e != NULL) && (e->toUtfProc != TableToUtfProc) - && (e->toUtfProc != Iso88591ToUtfProc)) { + if (e && e->toUtfProc != TableToUtfProc && + e->toUtfProc != Iso88591ToUtfProc) { Tcl_FreeEncoding((Tcl_Encoding) e); e = NULL; } @@ -2370,21 +2000,21 @@ LoadEscapeEncoding( Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est)); } } - ckfree(argv); + ckfree((char *) argv); Tcl_DStringFree(&lineString); } - size = offsetof(EscapeEncodingData, subTables) + size = sizeof(EscapeEncodingData) - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData); - dataPtr = (EscapeEncodingData *)ckalloc(size); + dataPtr = (EscapeEncodingData *) ckalloc(size); dataPtr->initLen = strlen(init); - memcpy(dataPtr->init, init, dataPtr->initLen + 1); + strcpy(dataPtr->init, init); dataPtr->finalLen = strlen(final); - memcpy(dataPtr->final, final, dataPtr->finalLen + 1); + strcpy(dataPtr->final, final); dataPtr->numSubTables = Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable); memcpy(dataPtr->subTables, Tcl_DStringValue(&escapeData), - Tcl_DStringLength(&escapeData)); + (size_t) Tcl_DStringLength(&escapeData)); Tcl_DStringFree(&escapeData); memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes)); @@ -2398,16 +2028,12 @@ LoadEscapeEncoding( dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1; } - /* - * Package everything into an encoding 
structure. - */ - type.encodingName = name; type.toUtfProc = EscapeToUtfProc; type.fromUtfProc = EscapeFromUtfProc; type.freeProc = EscapeFreeProc; type.nullSize = 1; - type.clientData = dataPtr; + type.clientData = (ClientData) dataPtr; return Tcl_CreateEncoding(&type); } @@ -2432,11 +2058,15 @@ LoadEscapeEncoding( static int BinaryProc( - TCL_UNUSED(void *), + ClientData clientData, /* Not used. */ const char *src, /* Source string (unknown encoding). */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2457,9 +2087,6 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } - if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { - srcLen = *dstCharsPtr; - } if (srcLen > dstLen) { srcLen = dstLen; result = TCL_CONVERT_NOSPACE; @@ -2468,18 +2095,18 @@ BinaryProc( *srcReadPtr = srcLen; *dstWrotePtr = srcLen; *dstCharsPtr = srcLen; - memcpy(dst, src, srcLen); + memcpy(dst, src, (size_t) srcLen); return result; } /* *------------------------------------------------------------------------- * - * UtfToUtfProc -- + * UtfIntToUtfExtProc -- * - * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation - * is not a no-op, because it turns a stream of improperly formed - * UTF-8 into a properly-formed stream. + * Convert from UTF-8 to UTF-8 while converting null-bytes from + * Tcl's internal representation (0xc0, 0x80) to the official + * representation (0x00). See UtfToUtfProc for details. * * Results: * Returns TCL_OK if conversion was successful. 
@@ -2491,14 +2118,18 @@ BinaryProc( */ static int -UtfToUtfProc( - void *clientData, /* additional flags */ +UtfIntToUtfExtProc( + ClientData clientData, /* Not used. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* TCL_ENCODING_* conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), - char *dst, /* Output buffer in which converted string is - * stored. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string + * is stored. */ int dstLen, /* The maximum length of output buffer in * bytes. */ int *srcReadPtr, /* Filled with the number of bytes from the @@ -2513,179 +2144,18 @@ UtfToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - const char *srcStart, *srcEnd, *srcClose; - const char *dstStart, *dstEnd; - int result, numChars, charLimit = INT_MAX; - int ch; - int profile; - - result = TCL_OK; - - srcStart = src; - srcEnd = src + srcLen; - srcClose = srcEnd; - if ((flags & TCL_ENCODING_END) == 0) { - srcClose -= 6; - } - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } - - dstStart = dst; - flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); - - profile = ENCODING_PROFILE_GET(flags); - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - - if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { - /* - * If there is more string to follow, this will ensure that the - * last UTF-8 character in the source buffer hasn't been cut off. 
- */ - - result = TCL_CONVERT_MULTIBYTE; - break; - } - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - break; - } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { - /* - * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to \xC0\x80. - */ - *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && - (UCHAR(src[1]) == 0x80) && - (!(flags & ENCODING_INPUT) || !PROFILE_TCL8(profile))) { - /* Special sequence \xC0\x80 */ - if (!PROFILE_TCL8(profile) && (flags & ENCODING_INPUT)) { - if (PROFILE_REPLACE(profile)) { - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - src += 2; - } else { - /* PROFILE_STRICT */ - result = TCL_CONVERT_SYNTAX; - break; - } - } else { - /* - * Convert 0xC080 to real nulls when we are in output mode, - * irrespective of the profile. - */ - *dst++ = 0; - src += 2; - } - - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { - /* - * Incomplete byte sequence. - * Always check before using Tcl_UtfToUniChar. Not doing so can cause - * it to run beyond the end of the buffer! If we happen on such an - * incomplete char its bytes are made to represent themselves unless - * the user has explicitly asked to be told. - */ - - if (flags & ENCODING_INPUT) { - /* Incomplete bytes for modified UTF-8 target */ - if (PROFILE_STRICT(profile)) { - result = (flags & TCL_ENCODING_CHAR_LIMIT) - ? 
TCL_CONVERT_MULTIBYTE - : TCL_CONVERT_SYNTAX; - break; - } - } - if (PROFILE_REPLACE(profile)) { - ch = UNICODE_REPLACE_CHAR; - ++src; - } else { - /* TCL_ENCODING_PROFILE_TCL8 */ - char chbuf[2]; - chbuf[0] = UCHAR(*src++); chbuf[1] = 0; - TclUtfToUniChar(chbuf, &ch); - } - dst += Tcl_UniCharToUtf(ch, dst); - } else { - int low; - size_t len = TclUtfToUniChar(src, &ch); - if (flags & ENCODING_INPUT) { - if (((len < 2) && (ch != 0)) || ((ch > 0xFFFF) && !(flags & ENCODING_UTF))) { - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_SYNTAX; - break; - } else if (PROFILE_REPLACE(profile)) { - ch = UNICODE_REPLACE_CHAR; - } - } - } - - const char *saveSrc = src; - src += len; - if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { - if (ch > 0xFFFF) { - /* CESU-8 6-byte sequence for chars > U+FFFF */ - ch -= 0x10000; - *dst++ = 0xED; - *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); - *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); - ch = (ch & 0x0CFF) | 0xDC00; - } - goto cesu8; - } else if ((ch | 0x7FF) == 0xDFFF) { - /* - * A surrogate character is detected, handle especially. - */ - if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } - if (PROFILE_REPLACE(profile)) { - ch = UNICODE_REPLACE_CHAR; - } else { - low = ch; - len = (src <= srcEnd - 3) ? TclUtfToUniChar(src, &low) : 0; - - if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { - - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } -cesu8: - *dst++ = (char)(((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char)(((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char)((ch | 0x80) & 0xBF); - continue; - } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; - } - } else if (SURROGATE(ch) && PROFILE_STRICT(profile)) { - result = (flags & ENCODING_INPUT) ? 
TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } - dst += Tcl_UniCharToUtf(ch, dst); - } - } - - *srcReadPtr = src - srcStart; - *dstWrotePtr = dst - dstStart; - *dstCharsPtr = numChars; - return result; + return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, + srcReadPtr, dstWrotePtr, dstCharsPtr, 1); } - + /* *------------------------------------------------------------------------- * - * Utf32ToUtfProc -- + * UtfExtToUtfIntProc -- * - * Convert from UTF-32 to UTF-8. + * Convert from UTF-8 to UTF-8 while converting null-bytes from the + * official representation (0x00) to Tcl's internal representation (0xc0, + * 0x80). See UtfToUtfProc for details. * * Results: * Returns TCL_OK if conversion was successful. @@ -2695,14 +2165,17 @@ cesu8: * *------------------------------------------------------------------------- */ - static int -Utf32ToUtfProc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ - const char *src, /* Source string in Unicode. */ +UtfExtToUtfIntProc( + ClientData clientData, /* Not used. */ + const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2719,130 +2192,18 @@ Utf32ToUtfProc( * correspond to the bytes stored in the * output buffer. 
*/ { - const char *srcStart, *srcEnd; - const char *dstEnd, *dstStart; - int result, numChars, charLimit = INT_MAX; - int ch = 0, bytesLeft = srcLen % 4; - - flags |= PTR2INT(clientData); - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } - result = TCL_OK; - - /* - * Check alignment with utf-32 (4 == sizeof(UTF-32)) - */ - if (bytesLeft != 0) { - /* We have a truncated code unit */ - result = TCL_CONVERT_MULTIBYTE; - srcLen -= bytesLeft; - } - - /* - * If last code point is a high surrogate, we cannot handle that yet, - * unless we are at the end. - */ - - if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) && - ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) && - ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) && - ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) { - result = TCL_CONVERT_MULTIBYTE; - srcLen-= 4; - } - - srcStart = src; - srcEnd = src + srcLen; - - dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; - - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - break; - } - - int prev = ch; - if (flags & TCL_ENCODING_LE) { - ch = (unsigned int)(src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); - } else { - ch = (unsigned int)(src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); - } - if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { - /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); - } - - if ((unsigned)ch > 0x10FFFF) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - break; - } - ch = UNICODE_REPLACE_CHAR; - } else if (SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - ch = 0; - break; - } - if (PROFILE_REPLACE(flags)) { - ch = UNICODE_REPLACE_CHAR; - } - } - - /* - * Special case for 1-byte utf chars for speed. 
Make sure we work with - * unsigned short-size data. - */ - - if ((unsigned)ch - 1 < 0x7F) { - *dst++ = (ch & 0xFF); - } else { - if (!HIGH_SURROGATE(prev) && LOW_SURROGATE(ch)) { - *dst = 0; /* In case of lower surrogate, don't try to combine */ - } - dst += Tcl_UniCharToUtf(ch, dst); - } - src += 4; - } - - if (HIGH_SURROGATE(ch)) { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); - } - - if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a code fragment left-over at the end */ - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - } else { - /* destination is not full, so we really are at the end now */ - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - } else { - /* PROFILE_REPLACE or PROFILE_TCL8 */ - result = TCL_OK; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - numChars++; - src += bytesLeft; /* Go past truncated code unit */ - } - } - } - - *srcReadPtr = src - srcStart; - *dstWrotePtr = dst - dstStart; - *dstCharsPtr = numChars; - return result; + return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, + srcReadPtr, dstWrotePtr, dstCharsPtr, 0); } - + /* *------------------------------------------------------------------------- * - * UtfToUtf32Proc -- + * UtfToUtfProc -- * - * Convert from UTF-8 to UTF-32. + * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation + * is not a no-op, because it will turn a stream of improperly formed + * UTF-8 into a properly formed stream. * * Results: * Returns TCL_OK if conversion was successful. @@ -2854,12 +2215,16 @@ Utf32ToUtfProc( */ static int -UtfToUtf32Proc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ +UtfToUtfProc( + ClientData clientData, /* Not used. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. 
*/ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2872,13 +2237,19 @@ UtfToUtf32Proc( int *dstWrotePtr, /* Filled with the number of bytes that were * stored in the output buffer as a result of * the conversion. */ - int *dstCharsPtr) /* Filled with the number of characters that + int *dstCharsPtr, /* Filled with the number of characters that * correspond to the bytes stored in the * output buffer. */ + int pureNullMode) /* Convert embedded nulls from internal + * representation to real null-bytes or vice + * versa. */ { - const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + const char *srcStart, *srcEnd, *srcClose; + char *dstStart, *dstEnd; int result, numChars; - int ch, len; + Tcl_UniChar ch; + + result = TCL_OK; srcStart = src; srcEnd = src + srcLen; @@ -2888,10 +2259,8 @@ UtfToUtf32Proc( } dstStart = dst; - dstEnd = dst + dstLen - sizeof(Tcl_UniChar); - flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - TCL_UTF_MAX; - result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* @@ -2906,27 +2275,34 @@ UtfToUtf32Proc( result = TCL_CONVERT_NOSPACE; break; } - len = TclUtfToUniChar(src, &ch); - if (SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_UNKNOWN; - break; - } - if (PROFILE_REPLACE(flags)) { - ch = UNICODE_REPLACE_CHAR; - } - } - src += len; - if (flags & TCL_ENCODING_LE) { - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 24) & 0xFF); + if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) { + /* + * Copy 7bit 
characters, but skip null-bytes when we are in input + * mode, so that they get converted to 0xc080. + */ + + *dst++ = *src++; + } else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 && + (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { + /* + * Convert 0xc080 to real nulls when we are in output mode. + */ + + *dst++ = 0; + src += 2; + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + /* + * Always check before using Tcl_UtfToUniChar. Not doing so can + * cause it to run beyond the end of the buffer! If we happen on such an + * incomplete char, its bytes are made to represent themselves. + */ + + ch = (unsigned char) *src; + src += 1; + dst += Tcl_UniCharToUtf(ch, dst); } else { - *dst++ = ((ch >> 24) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); + src += Tcl_UtfToUniChar(src, &ch); + dst += Tcl_UniCharToUtf(ch, dst); } } @@ -2939,9 +2315,9 @@ UtfToUtf32Proc( /* *------------------------------------------------------------------------- * - * Utf16ToUtfProc -- + * UnicodeToUtfProc -- * - * Convert from UTF-16 to UTF-8. + * Convert from Unicode to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2953,12 +2329,16 @@ UtfToUtf32Proc( */ static int -Utf16ToUtfProc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ +UnicodeToUtfProc( + ClientData clientData, /* Not used. */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2976,34 +2356,15 @@ Utf16ToUtfProc( * output buffer. 
*/ { const char *srcStart, *srcEnd; - const char *dstEnd, *dstStart; - int result, numChars, charLimit = INT_MAX; - unsigned short ch = 0; + char *dstEnd, *dstStart; + int result, numChars; + Tcl_UniChar ch; - flags |= PTR2INT(clientData); - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } result = TCL_OK; - - /* - * Check alignment with utf-16 (2 == sizeof(UTF-16)) - */ - - if ((srcLen % 2) != 0) { - result = TCL_CONVERT_MULTIBYTE; - srcLen--; - } - - /* - * If last code point is a high surrogate, we cannot handle that yet, - * unless we are at the end. - */ - - if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) && - ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; - srcLen-= 2; + srcLen /= sizeof(Tcl_UniChar); + srcLen *= sizeof(Tcl_UniChar); } srcStart = src; @@ -3012,101 +2373,22 @@ Utf16ToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) { + for (numChars = 0; src < srcEnd; numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } - - unsigned short prev = ch; - if (flags & TCL_ENCODING_LE) { - ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); - } else { - ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); - } - if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - src -= 2; /* Go back to beginning of high surrogate */ - dst--; /* Also undo writing a single byte too much */ - numChars--; - break; - } else if (PROFILE_REPLACE(flags)) { - /* - * Previous loop wrote a single byte to mark the high surrogate. - * Replace it with the replacement character. Further, restart - * current loop iteration since need to recheck destination space - * and reset processing of current character. 
- */ - ch = UNICODE_REPLACE_CHAR; - dst--; - dst += Tcl_UniCharToUtf(ch, dst); - src -= 2; - numChars--; - continue; - } else { - /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); - } - } - /* - * Special case for 1-byte utf chars for speed. Make sure we work with - * unsigned short-size data. + * Special case for 1-byte utf chars for speed. Make sure we + * work with Tcl_UniChar-size data. */ - - if ((unsigned)ch - 1 < 0x7F) { + ch = *(Tcl_UniChar *)src; + if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); - } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) { - dst += Tcl_UniCharToUtf(ch, dst); - } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) { - /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */ - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - break; - } else { - /* PROFILE_REPLACE */ - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - } } else { - *dst = 0; /* In case of lower surrogate, don't try to combine */ dst += Tcl_UniCharToUtf(ch, dst); } - } - - if (HIGH_SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - src -= 2; - dst--; - numChars--; - } else if (PROFILE_REPLACE(flags)) { - dst--; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - } else { - /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ - dst += Tcl_UniCharToUtf(-1, dst); - } - } - - /* - * If we had a truncated code unit at the end AND this is the last - * fragment AND profile is not "strict", stick FFFD in its place. 
- */ - if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - } else { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - } else { - /* PROFILE_REPLACE or PROFILE_TCL8 */ - result = TCL_OK; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - numChars++; - src++; /* Go past truncated code unit */ - } - } + src += sizeof(Tcl_UniChar); } *srcReadPtr = src - srcStart; @@ -3118,9 +2400,9 @@ Utf16ToUtfProc( /* *------------------------------------------------------------------------- * - * UtfToUtf16Proc -- + * UtfToUnicodeProc -- * - * Convert from UTF-8 to UTF-16. + * Convert from UTF-8 to Unicode. * * Results: * Returns TCL_OK if conversion was successful. @@ -3132,12 +2414,17 @@ Utf16ToUtfProc( */ static int -UtfToUtf16Proc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ +UtfToUnicodeProc( + ClientData clientData, /* TableEncodingData that specifies + * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. 
*/ int dstLen, /* The maximum length of output buffer in @@ -3156,117 +2443,8 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch, len; - - srcStart = src; - srcEnd = src + srcLen; - srcClose = srcEnd; - if ((flags & TCL_ENCODING_END) == 0) { - srcClose -= TCL_UTF_MAX; - } - - dstStart = dst; - dstEnd = dst + dstLen - 2; /* 2 -> sizeof a UTF-16 code unit */ - flags |= PTR2INT(clientData); - - result = TCL_OK; - for (numChars = 0; src < srcEnd; numChars++) { - if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { - /* - * If there is more string to follow, this will ensure that the - * last UTF-8 character in the source buffer hasn't been cut off. - */ - - result = TCL_CONVERT_MULTIBYTE; - break; - } - if (dst > dstEnd) { - result = TCL_CONVERT_NOSPACE; - break; - } - len = TclUtfToUniChar(src, &ch); - if (SURROGATE(ch)) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_UNKNOWN; - break; - } - if (PROFILE_REPLACE(flags)) { - ch = UNICODE_REPLACE_CHAR; - } - } - src += len; - if (flags & TCL_ENCODING_LE) { - if (ch <= 0xFFFF) { - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); - } else { - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0x3) | 0xDC; - } - } else { - if (ch <= 0xFFFF) { - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); - } else { - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - *dst++ = ((ch >> 8) & 0x3) | 0xDC; - *dst++ = (ch & 0xFF); - } - } - } - *srcReadPtr = src - srcStart; - *dstWrotePtr = dst - dstStart; - *dstCharsPtr = numChars; - return result; -} - -/* - *------------------------------------------------------------------------- - * - * UtfToUcs2Proc -- - * - * Convert from UTF-8 to UCS-2. - * - * Results: - * Returns TCL_OK if conversion was successful. - * - * Side effects: - * None. 
- * - *------------------------------------------------------------------------- - */ - -static int -UtfToUcs2Proc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */ - const char *src, /* Source string in UTF-8. */ - int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), - char *dst, /* Output buffer in which converted string is - * stored. */ - int dstLen, /* The maximum length of output buffer in - * bytes. */ - int *srcReadPtr, /* Filled with the number of bytes from the - * source string that were converted. This may - * be less than the original source length if - * there was a problem converting some source - * characters. */ - int *dstWrotePtr, /* Filled with the number of bytes that were - * stored in the output buffer as a result of - * the conversion. */ - int *dstCharsPtr) /* Filled with the number of characters that - * correspond to the bytes stored in the - * output buffer. */ -{ - const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; - int result, numChars, len; - Tcl_UniChar ch = 0; + Tcl_UniChar ch; - flags |= PTR2INT(clientData); srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -3275,7 +2453,7 @@ UtfToUcs2Proc( } dstStart = dst; - dstEnd = dst + dstLen - 2; /* 2 - size of UCS code unit */ + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { @@ -3291,34 +2469,20 @@ UtfToUcs2Proc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } - len = TclUtfToUniChar(src, &ch); - if (ch > 0xFFFF) { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_UNKNOWN; - break; - } - ch = UNICODE_REPLACE_CHAR; - } - if (PROFILE_STRICT(flags) && SURROGATE(ch)) { - result = TCL_CONVERT_SYNTAX; - break; - } - - src += len; - + } + src += TclUtfToUniChar(src, &ch); /* - * Need to handle this in a way that won't cause misalignment by - * casting dst to a Tcl_UniChar. 
[Bug 1122671] + * Need to handle this in a way that won't cause misalignment + * by casting dst to a Tcl_UniChar. [Bug 1122671] + * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. */ - - if (flags & TCL_ENCODING_LE) { - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); - } else { - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); - } +#ifdef WORDS_BIGENDIAN + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); +#else + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -3345,12 +2509,16 @@ UtfToUcs2Proc( static int TableToUtfProc( - void *clientData, /* TableEncodingData that specifies + ClientData clientData, /* TableEncodingData that specifies * encoding. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -3368,78 +2536,56 @@ TableToUtfProc( * output buffer. 
*/ { const char *srcStart, *srcEnd; - const char *dstEnd, *dstStart, *prefixBytes; - int result, byte, numChars, charLimit = INT_MAX; - Tcl_UniChar ch = 0; - const unsigned short *const *toUnicode; - const unsigned short *pageZero; - TableEncodingData *dataPtr = (TableEncodingData *)clientData; - - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } + char *dstEnd, *dstStart, *prefixBytes; + int result, byte, numChars; + Tcl_UniChar ch; + unsigned short **toUnicode; + unsigned short *pageZero; + TableEncodingData *dataPtr; + srcStart = src; srcEnd = src + srcLen; dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - toUnicode = (const unsigned short *const *) dataPtr->toUnicode; + dataPtr = (TableEncodingData *) clientData; + toUnicode = dataPtr->toUnicode; prefixBytes = dataPtr->prefixBytes; pageZero = toUnicode[0]; result = TCL_OK; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + for (numChars = 0; src < srcEnd; numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } byte = *((unsigned char *) src); if (prefixBytes[byte]) { - if (src >= srcEnd-1) { - /* Prefix byte but nothing after it */ - if (!(flags & TCL_ENCODING_END)) { - /* More data to come */ - result = TCL_CONVERT_MULTIBYTE; - break; - } else if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - break; - } else if (PROFILE_REPLACE(flags)) { - ch = UNICODE_REPLACE_CHAR; - } else { - /* For prefix bytes, we don't fallback to cp1252, see [1355b9a874] */ - ch = byte; - } - } else { - ch = toUnicode[byte][*((unsigned char *)++src)]; + src++; + if (src >= srcEnd) { + src--; + result = TCL_CONVERT_MULTIBYTE; + break; } + ch = toUnicode[byte][*((unsigned char *) src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - /* Prefix+suffix pair is invalid */ - if (PROFILE_STRICT(flags)) { + if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - if (PROFILE_REPLACE(flags)) 
{ - ch = UNICODE_REPLACE_CHAR; - } else { - char chbuf[2]; - chbuf[0] = byte; chbuf[1] = 0; - TclUtfToUniChar(chbuf, &ch); - } + ch = (Tcl_UniChar) byte; } - /* - * Special case for 1-byte Utf chars for speed. + * Special case for 1-byte utf chars for speed. */ - - if ((unsigned)ch - 1 < 0x7F) { + if (ch && ch < 0x80) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3447,7 +2593,6 @@ TableToUtfProc( src++; } - assert(src <= srcEnd); *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -3473,12 +2618,16 @@ TableToUtfProc( static int TableFromUtfProc( - void *clientData, /* TableEncodingData that specifies + ClientData clientData, /* TableEncodingData that specifies * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -3496,16 +2645,17 @@ TableFromUtfProc( * output buffer. 
*/ { const char *srcStart, *srcEnd, *srcClose; - const char *dstStart, *dstEnd, *prefixBytes; - Tcl_UniChar ch = 0; + char *dstStart, *dstEnd, *prefixBytes; + Tcl_UniChar ch; int result, len, word, numChars; - TableEncodingData *dataPtr = (TableEncodingData *)clientData; - const unsigned short *const *fromUnicode; + TableEncodingData *dataPtr; + unsigned short **fromUnicode; result = TCL_OK; + dataPtr = (TableEncodingData *) clientData; prefixBytes = dataPtr->prefixBytes; - fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode; + fromUnicode = dataPtr->fromUnicode; srcStart = src; srcEnd = src + srcLen; @@ -3529,19 +2679,24 @@ TableFromUtfProc( } len = TclUtfToUniChar(src, &ch); - /* Unicode chars > +U0FFFF cannot be represented in any table encoding */ - if (ch & 0xFFFF0000) { +#if TCL_UTF_MAX > 3 + /* + * This prevents a crash condition. More evaluation is required for + * full support of int Tcl_UniChar. [Bug 1004065] + */ + + if (ch & 0xffff0000) { word = 0; - } else { - word = fromUnicode[(ch >> 8)][ch & 0xFF]; - } + } else +#endif + word = fromUnicode[(ch >> 8)][ch & 0xff]; if ((word == 0) && (ch != 0)) { - if (PROFILE_STRICT(flags)) { + if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ + word = dataPtr->fallback; } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3586,11 +2741,15 @@ TableFromUtfProc( static int Iso88591ToUtfProc( - TCL_UNUSED(void *), + ClientData clientData, /* Ignored. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. 
*/ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -3608,12 +2767,9 @@ Iso88591ToUtfProc( * output buffer. */ { const char *srcStart, *srcEnd; - const char *dstEnd, *dstStart; - int result, numChars, charLimit = INT_MAX; + char *dstEnd, *dstStart; + int result, numChars; - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } srcStart = src; srcEnd = src + srcLen; @@ -3621,20 +2777,18 @@ Iso88591ToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; result = TCL_OK; - for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - Tcl_UniChar ch = 0; + for (numChars = 0; src < srcEnd; numChars++) { + Tcl_UniChar ch; if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } - ch = *((unsigned char *) src); - + ch = (Tcl_UniChar) *((unsigned char *) src); /* * Special case for 1-byte utf chars for speed. */ - - if ((unsigned)ch - 1 < 0x7F) { + if (ch && ch < 0x80) { *dst++ = (char) ch; } else { dst += Tcl_UniCharToUtf(ch, dst); @@ -3666,11 +2820,15 @@ Iso88591ToUtfProc( static int Iso88591FromUtfProc( - TCL_UNUSED(void *), + ClientData clientData, /* Ignored. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -3688,9 +2846,10 @@ Iso88591FromUtfProc( * output buffer. 
*/ { const char *srcStart, *srcEnd, *srcClose; - const char *dstStart, *dstEnd; - int result = TCL_OK, numChars; - Tcl_UniChar ch = 0; + char *dstStart, *dstEnd; + int result, numChars; + + result = TCL_OK; srcStart = src; srcEnd = src + srcLen; @@ -3703,6 +2862,7 @@ Iso88591FromUtfProc( dstEnd = dst + dstLen - 1; for (numChars = 0; src < srcEnd; numChars++) { + Tcl_UniChar ch; int len; if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -3720,16 +2880,17 @@ Iso88591FromUtfProc( * Check for illegal characters. */ - if (ch > 0xFF) { - if (PROFILE_STRICT(flags)) { + if (ch > 0xff) { + if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } + /* * Plunge on, using '?' as a fallback character. */ - ch = '?'; /* Profiles TCL8 and REPLACE */ + ch = (Tcl_UniChar) '?'; } if (dst > dstEnd) { @@ -3765,20 +2926,19 @@ Iso88591FromUtfProc( static void TableFreeProc( - void *clientData) /* TableEncodingData that specifies + ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *)clientData; + TableEncodingData *dataPtr; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] */ - ckfree(dataPtr->toUnicode); - dataPtr->toUnicode = NULL; - ckfree(dataPtr->fromUnicode); - dataPtr->fromUnicode = NULL; - ckfree(dataPtr); + dataPtr = (TableEncodingData *) clientData; + ckfree((char *) dataPtr->toUnicode); + ckfree((char *) dataPtr->fromUnicode); + ckfree((char *) dataPtr); } /* @@ -3800,7 +2960,7 @@ TableFreeProc( static int EscapeToUtfProc( - void *clientData, /* EscapeEncodingData that specifies + ClientData clientData, /* EscapeEncodingData that specifies * encoding. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes. */ @@ -3826,19 +2986,20 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. 
*/ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; - const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; - const unsigned short *const *tableToUnicode; - const Encoding *encodingPtr; - int state, result, numChars, charLimit = INT_MAX; - const char *dstStart, *dstEnd; - - if (flags & TCL_ENCODING_CHAR_LIMIT) { - charLimit = *dstCharsPtr; - } + EscapeEncodingData *dataPtr; + char *prefixBytes, *tablePrefixBytes; + unsigned short **tableToUnicode; + Encoding *encodingPtr; + int state, result, numChars; + const char *srcStart, *srcEnd; + char *dstStart, *dstEnd; + result = TCL_OK; - tablePrefixBytes = NULL; - tableToUnicode = NULL; + + tablePrefixBytes = NULL; /* lint. */ + tableToUnicode = NULL; /* lint. */ + + dataPtr = (EscapeEncodingData *) clientData; prefixBytes = dataPtr->prefixBytes; encodingPtr = NULL; @@ -3853,7 +3014,7 @@ EscapeToUtfProc( state = 0; } - for (numChars = 0; src < srcEnd && numChars <= charLimit; ) { + for (numChars = 0; src < srcEnd; ) { int byte, hi, lo, ch; if (dst > dstEnd) { @@ -3862,9 +3023,9 @@ EscapeToUtfProc( } byte = *((unsigned char *) src); if (prefixBytes[byte]) { - unsigned left, len, longest; + unsigned int left, len, longest; int checked, i; - const EscapeSubTable *subTablePtr; + EscapeSubTable *subTablePtr; /* * Saw the beginning of an escape sequence. @@ -3943,11 +3104,12 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!PROFILE_STRICT(flags)) { - unsigned skip = longest > left ? left : longest; - /* Unknown escape sequence */ - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - src += skip; + if ((flags & TCL_ENCODING_STOPONERROR) == 0) { + /* + * Skip the unknown escape sequence. 
+ */ + + src += longest; continue; } result = TCL_CONVERT_SYNTAX; @@ -3961,10 +3123,9 @@ EscapeToUtfProc( TableEncodingData *tableDataPtr; encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = (TableEncodingData *)encodingPtr->clientData; + tableDataPtr = (TableEncodingData *) encodingPtr->clientData; tablePrefixBytes = tableDataPtr->prefixBytes; - tableToUnicode = (const unsigned short *const*) - tableDataPtr->toUnicode; + tableToUnicode = tableDataPtr->toUnicode; } if (tablePrefixBytes[byte]) { @@ -4013,7 +3174,7 @@ EscapeToUtfProc( static int EscapeFromUtfProc( - void *clientData, /* EscapeEncodingData that specifies + ClientData clientData, /* EscapeEncodingData that specifies * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ @@ -4039,18 +3200,19 @@ EscapeFromUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; - const Encoding *encodingPtr; + EscapeEncodingData *dataPtr; + Encoding *encodingPtr; const char *srcStart, *srcEnd, *srcClose; - const char *dstStart, *dstEnd; + char *dstStart, *dstEnd; int state, result, numChars; - const TableEncodingData *tableDataPtr; - const char *tablePrefixBytes; - const unsigned short *const *tableFromUnicode; - Tcl_UniChar ch = 0; + TableEncodingData *tableDataPtr; + char *tablePrefixBytes; + unsigned short **tableFromUnicode; result = TCL_OK; + dataPtr = (EscapeEncodingData *) clientData; + srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -4062,7 +3224,7 @@ EscapeFromUtfProc( dstEnd = dst + dstLen - 1; /* - * RFC 1468 states that the text starts in ASCII, and switches to Japanese + * RFC1468 states that the text starts in ASCII, and switches to Japanese * characters, and that the text must end in ASCII. 
[Patch 474358] */ @@ -4073,21 +3235,21 @@ EscapeFromUtfProc( *dstWrotePtr = 0; return TCL_CONVERT_NOSPACE; } - memcpy(dst, dataPtr->init, dataPtr->initLen); + memcpy(dst, dataPtr->init, (size_t)dataPtr->initLen); dst += dataPtr->initLen; } else { state = PTR2INT(*statePtr); } encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; + tableDataPtr = (TableEncodingData *) encodingPtr->clientData; tablePrefixBytes = tableDataPtr->prefixBytes; - tableFromUnicode = (const unsigned short *const *) - tableDataPtr->fromUnicode; + tableFromUnicode = tableDataPtr->fromUnicode; for (numChars = 0; src < srcEnd; numChars++) { - unsigned len; + unsigned int len; int word; + Tcl_UniChar ch; if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* @@ -4099,17 +3261,17 @@ EscapeFromUtfProc( break; } len = TclUtfToUniChar(src, &ch); - word = tableFromUnicode[(ch >> 8)][ch & 0xFF]; + word = tableFromUnicode[(ch >> 8)][ch & 0xff]; if ((word == 0) && (ch != 0)) { int oldState; - const EscapeSubTable *subTablePtr; + EscapeSubTable *subTablePtr; oldState = state; for (state = 0; state < dataPtr->numSubTables; state++) { encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; - word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF]; + tableDataPtr = (TableEncodingData *) encodingPtr->clientData; + word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff]; if (word != 0) { break; } @@ -4117,22 +3279,21 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (PROFILE_STRICT(flags)) { + if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; + tableDataPtr = (TableEncodingData *) encodingPtr->clientData; word = tableDataPtr->fallback; } - tablePrefixBytes = (const char *) tableDataPtr->prefixBytes; - tableFromUnicode 
= (const unsigned short *const *) - tableDataPtr->fromUnicode; + tablePrefixBytes = tableDataPtr->prefixBytes; + tableFromUnicode = tableDataPtr->fromUnicode; /* * The state variable has the value of oldState when word is 0. - * In this case, the escape sequence should not be copied to dst + * In this case, the escape sequense should not be copied to dst * because the current character set is not changed. */ @@ -4150,7 +3311,8 @@ EscapeFromUtfProc( result = TCL_CONVERT_NOSPACE; break; } - memcpy(dst, subTablePtr->sequence, subTablePtr->sequenceLen); + memcpy(dst, subTablePtr->sequence, + (size_t) subTablePtr->sequenceLen); dst += subTablePtr->sequenceLen; } } @@ -4175,25 +3337,25 @@ EscapeFromUtfProc( } if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) { - unsigned len = dataPtr->subTables[0].sequenceLen; - + unsigned int len = dataPtr->subTables[0].sequenceLen; /* - * Certain encodings like iso2022-jp need to write an escape sequence - * after all characters have been converted. This logic checks that - * enough room is available in the buffer for the escape bytes. The - * TCL_ENCODING_END flag is cleared after a final escape sequence has - * been added to the buffer so that another call to this method does - * not attempt to append escape bytes a second time. + * Certain encodings like iso2022-jp need to write + * an escape sequence after all characters have + * been converted. This logic checks that enough + * room is available in the buffer for the escape bytes. + * The TCL_ENCODING_END flag is cleared after a final + * escape sequence has been added to the buffer so + * that another call to this method does not attempt + * to append escape bytes a second time. 
*/ - if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { if (state) { - memcpy(dst, dataPtr->subTables[0].sequence, len); + memcpy(dst, dataPtr->subTables[0].sequence, (size_t) len); dst += len; } - memcpy(dst, dataPtr->final, dataPtr->finalLen); + memcpy(dst, dataPtr->final, (size_t) dataPtr->finalLen); dst += dataPtr->finalLen; state &= ~TCL_ENCODING_END; } @@ -4211,50 +3373,50 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * Frees resources used by the encoding. + * This function is invoked when an EscapeEncodingData encoding is + * deleted. It deletes the memory used by the encoding. * * Results: * None. * * Side effects: - * Memory is freed. + * Memory freed. * *--------------------------------------------------------------------------- */ static void EscapeFreeProc( - void *clientData) /* EscapeEncodingData that specifies + ClientData clientData) /* EscapeEncodingData that specifies * encoding. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; + EscapeEncodingData *dataPtr; EscapeSubTable *subTablePtr; int i; + dataPtr = (EscapeEncodingData *) clientData; if (dataPtr == NULL) { return; } - /* - * The subTables should be freed recursively in normal operation but not - * during TclFinalizeEncodingSubsystem because they are also present as a - * weak reference in the toplevel encodingTable (i.e., they don't have a - * +1 refcount for this), and unpredictable nuking order could remove them - * from under the following loop's feet. [Bug 2891556] - * - * The encodingsInitialized flag, being reset on entry to TFES, can serve - * as a "not in finalization" test. 
+ * The subTables should be freed recursively in normal operation but not + * during TclFinalizeEncodingSubsystem because they are also present as a + * weak reference in the toplevel encodingTable (ie they don't have a +1 + * refcount for this), and unpredictable nuking order could remove them + * from under the following loop's feet [Bug 2891556]. + * + * The encodingsInitialized flag, being reset on entry to TFES, can serve + * as a "not in finalization" test. */ - - if (encodingsInitialized) { - subTablePtr = dataPtr->subTables; - for (i = 0; i < dataPtr->numSubTables; i++) { - FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr); - subTablePtr->encodingPtr = NULL; - subTablePtr++; + if (encodingsInitialized) + { + subTablePtr = dataPtr->subTables; + for (i = 0; i < dataPtr->numSubTables; i++) { + FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr); + subTablePtr++; + } } - } - ckfree(dataPtr); + ckfree((char *) dataPtr); } /* @@ -4282,8 +3444,11 @@ GetTableEncoding( EscapeEncodingData *dataPtr,/* Contains names of encodings. */ int state) /* Index in dataPtr of desired Encoding. */ { - EscapeSubTable *subTablePtr = &dataPtr->subTables[state]; - Encoding *encodingPtr = subTablePtr->encodingPtr; + EscapeSubTable *subTablePtr; + Encoding *encodingPtr; + + subTablePtr = &dataPtr->subTables[state]; + encodingPtr = subTablePtr->encodingPtr; if (encodingPtr == NULL) { encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name); @@ -4301,7 +3466,7 @@ GetTableEncoding( /* *--------------------------------------------------------------------------- * - * unilen, unilen4 -- + * unilen -- * * A helper function for the Tcl_ExternalToUtf functions. 
This function * is similar to strlen for double-byte characters: it returns the number @@ -4328,19 +3493,6 @@ unilen( } return (char *) p - src; } - -static size_t -unilen4( - const char *src) -{ - unsigned int *p; - - p = (unsigned int *) src; - while (*p != 0x00000000) { - p++; - } - return (char *) p - src; -} /* *------------------------------------------------------------------------- @@ -4368,170 +3520,53 @@ unilen4( static void InitializeEncodingSearchPath( char **valuePtr, - TCL_HASH_TYPE *lengthPtr, + int *lengthPtr, Tcl_Encoding *encodingPtr) { - const char *bytes; - Tcl_Size i, numDirs, numBytes; - Tcl_Obj *libPathObj, *encodingObj, *searchPathObj; + char *bytes; + int i, numDirs, numBytes; + Tcl_Obj *libPath, *encodingObj, *searchPath; TclNewLiteralStringObj(encodingObj, "encoding"); - TclNewObj(searchPathObj); + TclNewObj(searchPath); Tcl_IncrRefCount(encodingObj); - Tcl_IncrRefCount(searchPathObj); - libPathObj = TclGetLibraryPath(); - Tcl_IncrRefCount(libPathObj); - TclListObjLength(NULL, libPathObj, &numDirs); + Tcl_IncrRefCount(searchPath); + libPath = TclGetLibraryPath(); + Tcl_IncrRefCount(libPath); + Tcl_ListObjLength(NULL, libPath, &numDirs); for (i = 0; i < numDirs; i++) { - Tcl_Obj *directoryObj, *pathObj; + Tcl_Obj *directory, *path; Tcl_StatBuf stat; - Tcl_ListObjIndex(NULL, libPathObj, i, &directoryObj); - pathObj = Tcl_FSJoinToPath(directoryObj, 1, &encodingObj); - Tcl_IncrRefCount(pathObj); - if ((0 == Tcl_FSStat(pathObj, &stat)) && S_ISDIR(stat.st_mode)) { - Tcl_ListObjAppendElement(NULL, searchPathObj, pathObj); + Tcl_ListObjIndex(NULL, libPath, i, &directory); + path = Tcl_FSJoinToPath(directory, 1, &encodingObj); + Tcl_IncrRefCount(path); + if ((0 == Tcl_FSStat(path, &stat)) && S_ISDIR(stat.st_mode)) { + Tcl_ListObjAppendElement(NULL, searchPath, path); } - Tcl_DecrRefCount(pathObj); + Tcl_DecrRefCount(path); } - Tcl_DecrRefCount(libPathObj); + Tcl_DecrRefCount(libPath); Tcl_DecrRefCount(encodingObj); *encodingPtr = 
libraryPath.encoding; if (*encodingPtr) { ((Encoding *)(*encodingPtr))->refCount++; } - bytes = TclGetStringFromObj(searchPathObj, &numBytes); + bytes = Tcl_GetStringFromObj(searchPath, &numBytes); *lengthPtr = numBytes; - *valuePtr = (char *)ckalloc(numBytes + 1); - memcpy(*valuePtr, bytes, numBytes + 1); - Tcl_DecrRefCount(searchPathObj); -} - -/* - *------------------------------------------------------------------------ - * - * TclEncodingProfileParseName -- - * - * Maps an encoding profile name to its integer equivalent. - * - * Results: - * TCL_OK on success or TCL_ERROR on failure. - * - * Side effects: - * Returns the profile enum value in *profilePtr - * - *------------------------------------------------------------------------ - */ -int -TclEncodingProfileNameToId( - Tcl_Interp *interp, /* For error messages. May be NULL */ - const char *profileName, /* Name of profile */ - int *profilePtr) /* Output */ -{ - size_t i; - size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); - - for (i = 0; i < numProfiles; ++i) { - if (!strcmp(profileName, encodingProfiles[i].name)) { - *profilePtr = encodingProfiles[i].value; - return TCL_OK; - } - } - if (interp) { - Tcl_Obj *errorObj; - /* This code assumes at least two profiles :-) */ - errorObj = - Tcl_ObjPrintf("bad profile name \"%s\": must be", - profileName); - for (i = 0; i < (numProfiles - 1); ++i) { - Tcl_AppendStringsToObj( - errorObj, " ", encodingProfiles[i].name, ",", (void *)NULL); - } - Tcl_AppendStringsToObj( - errorObj, " or ", encodingProfiles[numProfiles-1].name, (void *)NULL); - - Tcl_SetObjResult(interp, errorObj); - Tcl_SetErrorCode( - interp, "TCL", "ENCODING", "PROFILE", profileName, (void *)NULL); - } - return TCL_ERROR; -} - -/* - *------------------------------------------------------------------------ - * - * TclEncodingProfileValueToName -- - * - * Maps an encoding profile value to its name. - * - * Results: - * Pointer to the name or NULL on failure. 
Caller must not make - * not modify the string and must make a copy to hold on to it. - * - * Side effects: - * None. - *------------------------------------------------------------------------ - */ -const char * -TclEncodingProfileIdToName( - Tcl_Interp *interp, /* For error messages. May be NULL */ - int profileValue) /* Profile #define value */ -{ - size_t i; - - for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { - if (profileValue == encodingProfiles[i].value) { - return encodingProfiles[i].name; - } - } - if (interp) { - Tcl_SetObjResult( - interp, - Tcl_ObjPrintf( - "Internal error. Bad profile id \"%d\".", - profileValue)); - Tcl_SetErrorCode( - interp, "TCL", "ENCODING", "PROFILEID", (void *)NULL); - } - return NULL; + *valuePtr = ckalloc((unsigned int) numBytes + 1); + memcpy(*valuePtr, bytes, (size_t) numBytes + 1); + Tcl_DecrRefCount(searchPath); } /* - *------------------------------------------------------------------------ - * - * TclGetEncodingProfiles -- - * - * Get the list of supported encoding profiles. - * - * Results: - * None. - * - * Side effects: - * The list of profile names is stored in the interpreter result. - * - *------------------------------------------------------------------------ - */ -void -TclGetEncodingProfiles(Tcl_Interp *interp) -{ - size_t i, n; - Tcl_Obj *objPtr; - n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); - objPtr = Tcl_NewListObj(n, NULL); - for (i = 0; i < n; ++i) { - Tcl_ListObjAppendElement( - interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE)); - } - Tcl_SetObjResult(interp, objPtr); -} - -/* * Local Variables: * mode: c * c-basic-offset: 4 * fill-column: 78 * End: */ + |
