diff options
Diffstat (limited to 'generic/tclUnicodeObj.c')
-rw-r--r-- | generic/tclUnicodeObj.c | 771 |
1 files changed, 771 insertions, 0 deletions
diff --git a/generic/tclUnicodeObj.c b/generic/tclUnicodeObj.c new file mode 100644 index 0000000..869b8c7 --- /dev/null +++ b/generic/tclUnicodeObj.c @@ -0,0 +1,771 @@ +/* + * tclUnicodeObj.c -- + * + * This file contains the implementation of the Unicode internal + * representation of Tcl objects. + * + * Copyright (c) 1999 by Scriptics Corporation. + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * RCS: @(#) $Id: tclUnicodeObj.c,v 1.2 1999/06/08 02:59:27 hershey Exp $ + */ + +#include <math.h> +#include "tclInt.h" +#include "tclPort.h" + +/* + * Prototypes for local procedures defined in this file: + */ + +static void DupUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, + Tcl_Obj *copyPtr)); +static void FreeUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void UpdateStringOfUnicode _ANSI_ARGS_((Tcl_Obj *objPtr)); +static int SetUnicodeFromAny _ANSI_ARGS_((Tcl_Interp *interp, + Tcl_Obj *objPtr)); + +static int AllSingleByteChars _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void TclAppendUniCharStrToObj _ANSI_ARGS_(( + register Tcl_Obj *objPtr, Tcl_UniChar *unichars, + int numChars)); +static Tcl_Obj * TclNewUnicodeObj _ANSI_ARGS_((Tcl_UniChar *unichars, + int numChars)); +static void SetOptUnicodeFromAny _ANSI_ARGS_((Tcl_Obj *objPtr, + int numChars)); + +/* + * The following object type represents a Unicode string. A Unicode string + * is an internationalized string. Conceptually, a Unicode string is an + * array of 16-bit quantities organized as a sequence of properly formed + * UTF-8 characters. There is a one-to-one map between Unicode and UTF + * characters. The Unicode ojbect is opitmized for the case where each UTF + * char in a string is only one byte. In this case, we store the value of + * numChars, but we don't copy the bytes to the unicodeObj->chars. Before + * accessing obj->chars, check if unicodeObj->numChars == obj->length. + */ + +Tcl_ObjType tclUnicodeType = { + "unicode", + FreeUnicodeInternalRep, + DupUnicodeInternalRep, + UpdateStringOfUnicode, + SetUnicodeFromAny +}; + +/* + * The following structure is the internal rep for a Unicode object. + * Keeps track of how much memory has been used and how much has been + * allocated for the Unicode to enable growing and shrinking of the + * Unicode object with fewer mallocs. + */ + +typedef struct Unicode { + int numChars; /* The number of chars in the unicode + * string. */ + int used; /* The number of bytes used in the unicode + * string. */ + int allocated; /* The amount of space actually allocated + * minus 1 byte. */ + unsigned char chars[4]; /* The array of chars. The actual size of + * this field depends on the 'allocated' field + * above. */ +} Unicode; + +#define UNICODE_SIZE(len) \ + ((unsigned) (sizeof(Unicode) - 4 + (len))) +#define GET_UNICODE(objPtr) \ + ((Unicode *) (objPtr)->internalRep.otherValuePtr) +#define SET_UNICODE(objPtr, unicodePtr) \ + (objPtr)->internalRep.otherValuePtr = (VOID *) (unicodePtr) + + +/* + *---------------------------------------------------------------------- + * + * TclGetUnicodeLengthFromObj -- + * + * Get the length of the Unicode string from the Tcl object. If + * the object is not already a Unicode object, an attempt will be + * made to convert it to one. + * + * Results: + * Pointer to unicode string representing the unicode object. + * + * Side effects: + * Frees old internal rep. Allocates memory for new internal rep. + * + *---------------------------------------------------------------------- + */ + +int +TclGetUnicodeLengthFromObj(objPtr) + Tcl_Obj *objPtr; /* The Unicode object. */ +{ + int length; + Unicode *unicodePtr; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + + length = unicodePtr->numChars; + return length; +} + +/* + *---------------------------------------------------------------------- + * + * TclGetUniCharFromObj -- + * + * Get the index'th Unicode character from the Unicode object. If + * the object is not already a Unicode object, an attempt will be + * made to convert it to one. The index is assumed to be in the + * appropriate range. + * + * Results: + * Returns the index'th Unicode character in the Object. + * + * Side effects: + * Fills unichar with the index'th Unicode character. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +TclGetUniCharFromObj(objPtr, index) + Tcl_Obj *objPtr; /* The Unicode object. */ + int index; /* Get the index'th character. */ +{ + Tcl_UniChar *unicharPtr, unichar; + Unicode *unicodePtr; + int length; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + length = objPtr->length; + + if (AllSingleByteChars(objPtr)) { + int length; + char *str; + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. We get the Utf string + * and convert the index'th byte to a Unicode character. + */ + + str = Tcl_GetStringFromObj(objPtr, &length); + Tcl_UtfToUniChar(&str[index], &unichar); + } else { + unicharPtr = (Tcl_UniChar *)unicodePtr->chars; + unichar = unicharPtr[index]; + } + return unichar; +} + +/* + *---------------------------------------------------------------------- + * + * TclGetRangeFromObj -- + * + * Create a Tcl Object that contains the chars between first and + * last of the object indicated by "objPtr". If the object is not + * already a Unicode object, an attempt will be made to convert it + * to one. The first and last indices are assumed to be in the + * appropriate range. + * + * Results: + * Returns a new Tcl Object of either "string" or "unicode" type, + * containing the range of chars. + * + * Side effects: + * Changes the internal rep of "objPtr" to unicode. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj* +TclGetRangeFromObj(objPtr, first, last) + + Tcl_Obj *objPtr; /* The Tcl object to find the range of. */ + int first; /* First index of the range. */ + int last; /* Last index of the range. */ +{ + Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ + Tcl_UniChar *unicharPtr; + Unicode *unicodePtr; + int length; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + length = objPtr->length; + + if (unicodePtr->numChars != length) { + unicharPtr = (Tcl_UniChar *)unicodePtr->chars; + newObjPtr = TclNewUnicodeObj(&unicharPtr[first], last-first+1); + } else { + int length; + char *str; + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. Create a new string + * object containing the specified range of chars. + */ + + str = Tcl_GetStringFromObj(objPtr, &length); + newObjPtr = Tcl_NewStringObj(&str[first], last-first+1); + } + return newObjPtr; +} + +/* + *---------------------------------------------------------------------- + * + * TclAppendObjToUnicodeObj -- + * + * This procedure appends the contest of "srcObjPtr" to the Unicode + * object "destPtr". + * + * Results: + * None. + * + * Side effects: + * If srcObjPtr doesn't have an internal rep, then it is given a + * Unicode internal rep. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj * +TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) + register Tcl_Obj *targetObjPtr; /* Points to the object to + * append to. */ + register Tcl_Obj *srcObjPtr; /* Points to the object to + * append from. */ +{ + int numBytes, numChars; + Tcl_Obj *resultObjPtr; + char *utfSrcStr; + Tcl_UniChar *unicharSrcStr; + Unicode *unicodePtr; + Tcl_DString dsPtr; + + /* + * Duplicate the target if it is shared. + * Change the result's internal rep to Unicode object. + */ + + if (Tcl_IsShared(targetObjPtr)) { + resultObjPtr = Tcl_DuplicateObj(targetObjPtr); + } else { + resultObjPtr = targetObjPtr; + } + SetUnicodeFromAny(NULL, resultObjPtr); + + /* + * Case where target chars are 1 byte long: + * If src obj is of "string" or null type, then convert it to "unicode" + * type. Src objs of other types (such as int) are left in tact to keep + * them from shimmering between types. If the src obj is a unichar obj, + * and all src chars are also 1 byte long, the src string is appended to + * the target "unicode" obj, and the target obj maintains its "optimized" + * status. + */ + + if (AllSingleByteChars(resultObjPtr)) { + + int length; + char *stringRep; + + if (srcObjPtr->typePtr == &tclStringType + || srcObjPtr->typePtr == NULL) { + SetUnicodeFromAny(NULL, srcObjPtr); + } + + stringRep = Tcl_GetStringFromObj(srcObjPtr, &length); + Tcl_AppendToObj(resultObjPtr, stringRep, length); + + if ((srcObjPtr->typePtr == &tclUnicodeType) + && (AllSingleByteChars(srcObjPtr))) { + SetOptUnicodeFromAny(resultObjPtr, resultObjPtr->length); + } + return resultObjPtr; + } + + /* + * Extract a unicode string from "unicode" or "string" type objects. + * Extract the utf string from non-unicode objects, and convert the + * utf string to unichar string locally. + * If the src obj is a "string" obj, convert it to "unicode" type. + * Src objs of other types (such as int) are left in tact to keep + * them from shimmering between types. + */ + + Tcl_DStringInit(&dsPtr); + if (srcObjPtr->typePtr == &tclStringType || srcObjPtr->typePtr == NULL) { + SetUnicodeFromAny(NULL, srcObjPtr); + } + if (srcObjPtr->typePtr == &tclUnicodeType) { + if (AllSingleByteChars(srcObjPtr)) { + + unicodePtr = GET_UNICODE(srcObjPtr); + numChars = unicodePtr->numChars; + + utfSrcStr = Tcl_GetStringFromObj(srcObjPtr, &numBytes); + unicharSrcStr = (Tcl_UniChar *)Tcl_UtfToUniCharDString(utfSrcStr, + numBytes, &dsPtr); + } else { + unicodePtr = GET_UNICODE(srcObjPtr); + numChars = unicodePtr->numChars; + unicharSrcStr = (Tcl_UniChar *)unicodePtr->chars; + } + } else { + utfSrcStr = Tcl_GetStringFromObj(srcObjPtr, &numBytes); + numChars = Tcl_NumUtfChars(utfSrcStr, numBytes); + unicharSrcStr = (Tcl_UniChar *)Tcl_UtfToUniCharDString(utfSrcStr, + numBytes, &dsPtr); + } + if (numChars == 0) { + return resultObjPtr; + } + + /* + * Append the unichar src string to the result object. + */ + + TclAppendUniCharStrToObj(resultObjPtr, unicharSrcStr, numChars); + Tcl_DStringFree(&dsPtr); + return resultObjPtr; +} + +/* + *---------------------------------------------------------------------- + * + * TclAppendUniCharStrToObj -- + * + * This procedure appends the contents of "srcObjPtr" to the + * Unicode object "objPtr". + * + * Results: + * None. + * + * Side effects: + * If srcObjPtr doesn't have an internal rep, then it is given a + * Unicode internal rep. + * + *---------------------------------------------------------------------- + */ + +void +TclAppendUniCharStrToObj(objPtr, unichars, numNewChars) + register Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unichars; /* The unicode string to append to the + * object. */ + int numNewChars; /* Number of chars in "unichars". */ +{ + Unicode *unicodePtr; + int usedBytes, numNewBytes, totalNumBytes, totalNumChars; + + /* + * Invalidate the StringRep. + */ + + Tcl_InvalidateStringRep(objPtr); + + unicodePtr = GET_UNICODE(objPtr); + + usedBytes = unicodePtr->used; + totalNumChars = numNewChars + unicodePtr->numChars; + totalNumBytes = totalNumChars * sizeof(Tcl_UniChar); + numNewBytes = numNewChars * sizeof(Tcl_UniChar); + + if (unicodePtr->allocated < totalNumBytes) { + int allocatedBytes = totalNumBytes * 2; + + /* + * There isn't currently enough space in the Unicode + * representation so allocate additional space. In fact, + * overallocate so that there is room for future growth without + * having to reallocate again. + */ + + unicodePtr = (Unicode *) ckrealloc(unicodePtr, + UNICODE_SIZE(allocatedBytes)); + memcpy((VOID *) (unicodePtr->chars + usedBytes), + (VOID *) unichars, (size_t) numNewBytes); + + unicodePtr->allocated = allocatedBytes; + unicodePtr = SET_UNICODE(objPtr, unicodePtr); + } + + memcpy((VOID *) (unicodePtr->chars + usedBytes), + (VOID *) unichars, (size_t) numNewBytes); + unicodePtr->used = totalNumBytes; + unicodePtr->numChars = totalNumChars; +} + +/* + *--------------------------------------------------------------------------- + * + * TclNewUnicodeObj -- + * + * This procedure is creates a new Unicode object and initializes + * it from the given Utf String. If the Utf String is the same size + * as the Unicode string, don't duplicate the data. + * + * Results: + * The newly created object is returned. This object will have no + * initial string representation. The returned object has a ref count + * of 0. + * + * Side effects: + * Memory allocated for new object and copy of Unicode argument. + * + *--------------------------------------------------------------------------- + */ + +Tcl_Obj * +TclNewUnicodeObj(unichars, numChars) + Tcl_UniChar *unichars; /* The unicode string used to initialize + * the new object. */ + int numChars; /* Number of characters in the unicode + * string. */ +{ + Tcl_Obj *objPtr; + Unicode *unicodePtr; + int numBytes; + + numBytes = numChars * sizeof(Tcl_UniChar); + + TclNewObj(objPtr); + objPtr->bytes = NULL; + objPtr->typePtr = &tclUnicodeType; + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(numBytes)); + unicodePtr->used = numBytes; + unicodePtr->numChars = numChars; + unicodePtr->allocated = numBytes; + memcpy((VOID *) unicodePtr->chars, (VOID *) unichars, (size_t) numBytes); + SET_UNICODE(objPtr, unicodePtr); + return objPtr; +} + +/* + *--------------------------------------------------------------------------- + * + * TclAllSingleByteChars -- + * + * Initialize the internal representation of a Unicode Tcl_Obj + * to a copy of the internal representation of an existing Unicode + * object. + * + * Results: + * None. + * + * Side effects: + * Allocates memory. + * + *--------------------------------------------------------------------------- + */ + +static int +AllSingleByteChars(objPtr) + Tcl_Obj *objPtr; /* Object whose char lengths to check. */ +{ + Unicode *unicodePtr; + int numBytes, numChars; + + unicodePtr = GET_UNICODE(objPtr); + numChars = unicodePtr->numChars; + numBytes = objPtr->length; + + if (numChars == numBytes) { + return 1; + } else { + return 0; + } +} + +/* + *--------------------------------------------------------------------------- + * + * DupUnicodeInternalRep -- + * + * Initialize the internal representation of a Unicode Tcl_Obj + * to a copy of the internal representation of an existing Unicode + * object. + * + * Results: + * None. + * + * Side effects: + * Allocates memory. + * + *--------------------------------------------------------------------------- + */ + +static void +DupUnicodeInternalRep(srcPtr, copyPtr) + Tcl_Obj *srcPtr; /* Object with internal rep to copy. */ + Tcl_Obj *copyPtr; /* Object with internal rep to set. */ +{ + Unicode *srcUnicodePtr = GET_UNICODE(srcPtr); + Unicode *copyUnicodePtr; /*GET_UNICODE(copyPtr);*/ + + /* + * If the src obj is a string of 1-byte Utf chars, then copy the + * string rep of the source object and create an "empty" Unicode + * internal rep for the new object. Otherwise, copy Unicode + * internal rep, and invalidate the string rep of the new object. + */ + + if (AllSingleByteChars(srcPtr)) { + copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); + } else { + int used = srcUnicodePtr->used; + int allocated = srcUnicodePtr->allocated; + Tcl_UniChar *unichars; + + unichars = (Tcl_UniChar *)srcUnicodePtr->chars; + + copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(allocated)); + + copyUnicodePtr->used = used; + copyUnicodePtr->allocated = allocated; + memcpy((VOID *) copyUnicodePtr->chars, + (VOID *) srcUnicodePtr->chars, (size_t) used); + } + copyUnicodePtr->numChars = srcUnicodePtr->numChars; + SET_UNICODE(copyPtr, copyUnicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * TclSetUnicodeObj -- + * + * Modify an object to be a Unicode object and to have the specified + * unicode string as its value. + * + * Results: + * None. + * + * Side effects: + * The object's old string rep and internal rep is freed. + * Memory allocated for copy of unicode argument. + * + *---------------------------------------------------------------------- + */ + +void +TclSetUnicodeObj(objPtr, chars, length) + Tcl_Obj *objPtr; /* Object to initialize as a Unicode obj. */ + unsigned char *chars; /* The unicode string to use as the new + * value. */ + int length; /* Length of the unicode string, which must + * be >= 0. */ +{ + Tcl_ObjType *typePtr; + Unicode *unicodePtr; + + if (Tcl_IsShared(objPtr)) { + panic("TclSetUnicodeObj called with shared object"); + } + typePtr = objPtr->typePtr; + if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) { + (*typePtr->freeIntRepProc)(objPtr); + } + Tcl_InvalidateStringRep(objPtr); + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(length)); + unicodePtr->used = length; + unicodePtr->allocated = length; + memcpy((VOID *) unicodePtr->chars, (VOID *) chars, (size_t) length); + + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * UpdateStringOfUnicode -- + * + * Update the string representation for a Unicode data object. + * Note: This procedure does not invalidate an existing old string rep + * so storage will be lost if this has not already been done. + * + * Results: + * None. + * + * Side effects: + * The object's string is set to a valid string that results from + * the Unicode-to-string conversion. + * + * The object becomes a string object -- the internal rep is + * discarded and the typePtr becomes NULL. + * + *--------------------------------------------------------------------------- + */ + +static void +UpdateStringOfUnicode(objPtr) + Tcl_Obj *objPtr; /* Unicode object whose string rep to + * update. */ +{ + int i, length, size; + Tcl_UniChar *src; + char dummy[TCL_UTF_MAX]; + char *dst; + Unicode *unicodePtr; + + unicodePtr = GET_UNICODE(objPtr); + src = (Tcl_UniChar *) unicodePtr->chars; + length = unicodePtr->used; + + /* + * How much space will string rep need? + */ + + size = 0; + for (i = 0; i < unicodePtr->numChars; i++) { + size += Tcl_UniCharToUtf((int) src[i], dummy); + } + + dst = (char *) ckalloc((unsigned) (size + 1)); + objPtr->bytes = dst; + objPtr->length = size; + + for (i = 0; i < unicodePtr->numChars; i++) { + dst += Tcl_UniCharToUtf(src[i], dst); + } + *dst = '\0'; +} + +/* + *--------------------------------------------------------------------------- + * + * SetOptUnicodeFromAny -- + * + * Generate the Unicode internal rep from the string rep. + * + * Results: + * The return value is always TCL_OK. + * + * Side effects: + * A Unicode object is stored as the internal rep of objPtr. The Unicode + * ojbect is opitmized for the case where each UTF char in a string is only + * one byte. In this case, we store the value of numChars, but we don't copy + * the bytes to the unicodeObj->chars. Before accessing obj->chars, check if + * all chars are 1 byte long. + * + *--------------------------------------------------------------------------- + */ + +static void +SetOptUnicodeFromAny(objPtr, numChars) + Tcl_Obj *objPtr; /* The object to convert to type Unicode. */ + int numChars; +{ + Tcl_ObjType *typePtr; + Unicode *unicodePtr; + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); + unicodePtr->numChars = numChars; + + typePtr = objPtr->typePtr; + if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { + (*typePtr->freeIntRepProc)(objPtr); + } + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * SetUnicodeFromAny -- + * + * Generate the Unicode internal rep from the string rep. + * + * Results: + * The return value is always TCL_OK. + * + * Side effects: + * A Unicode object is stored as the internal rep of objPtr. The Unicode + * ojbect is opitmized for the case where each UTF char in a string is only + * one byte. In this case, we store the value of numChars, but we don't copy + * the bytes to the unicodeObj->chars. Before accessing obj->chars, check if + * all chars are 1 byte long. + * + *--------------------------------------------------------------------------- + */ + +static int +SetUnicodeFromAny(interp, objPtr) + Tcl_Interp *interp; /* Not used. */ + Tcl_Obj *objPtr; /* The object to convert to type Unicode. */ +{ + Tcl_ObjType *typePtr; + int numBytes, numChars; + char *src, *srcEnd; + Unicode *unicodePtr; + unsigned char *dst; + + typePtr = objPtr->typePtr; + if (typePtr != &tclUnicodeType) { + src = Tcl_GetStringFromObj(objPtr, &numBytes); + + numChars = Tcl_NumUtfChars(src, numBytes); + if (numChars == numBytes) { + SetOptUnicodeFromAny(objPtr, numChars); + } else { + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(numChars + * sizeof(Tcl_UniChar))); + srcEnd = src + numBytes; + + for (dst = unicodePtr->chars; src < srcEnd; + dst += sizeof(Tcl_UniChar)) { + src += Tcl_UtfToUniChar(src, (Tcl_UniChar *) dst); + } + + unicodePtr->used = numChars * sizeof(Tcl_UniChar); + unicodePtr->numChars = numChars; + unicodePtr->allocated = numChars * sizeof(Tcl_UniChar); + + if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { + (*typePtr->freeIntRepProc)(objPtr); + } + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); + } + } + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * + * FreeUnicodeInternalRep -- + * + * Deallocate the storage associated with a Unicode data object's + * internal representation. + * + * Results: + * None. + * + * Side effects: + * Frees memory. + * + *---------------------------------------------------------------------- + */ + +static void +FreeUnicodeInternalRep(objPtr) + Tcl_Obj *objPtr; /* Object with internal rep to free. */ +{ + ckfree((char *) GET_UNICODE(objPtr)); +} |