summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c1654
1 files changed, 1252 insertions, 402 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 92217f3..fc2835d 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -3,13 +3,14 @@
*
* Contains the implementation of the encoding conversion package.
*
- * Copyright (c) 1996-1998 Sun Microsystems, Inc.
+ * Copyright © 1996-1998 Sun Microsystems, Inc.
*
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
*/
#include "tclInt.h"
+#include "tclIO.h"
typedef size_t (LengthProc)(const char *src);
@@ -33,20 +34,22 @@ typedef struct {
Tcl_EncodingFreeProc *freeProc;
/* If non-NULL, function to call when this
* encoding is deleted. */
- int nullSize; /* Number of 0x00 bytes that signify
+ void *clientData; /* Arbitrary value associated with encoding
+ * type. Passed to conversion functions. */
+ Tcl_Size nullSize; /* Number of 0x00 bytes that signify
* end-of-string in this encoding. This number
* is used to determine the source string
* length when the srcLen argument is
- * negative. This number can be 1 or 2. */
- ClientData clientData; /* Arbitrary value associated with encoding
- * type. Passed to conversion functions. */
+ * negative. This number can be 1, 2, or 4. */
LengthProc *lengthProc; /* Function to compute length of
* null-terminated strings in this encoding.
* If nullSize is 1, this is strlen; if
* nullSize is 2, this is a function that
* returns the number of bytes in a 0x0000
- * terminated string. */
- int refCount; /* Number of uses of this structure. */
+ * terminated string; if nullSize is 4, this
+ * is a function that returns the number of
+ * bytes in a 0x00000000 terminated string. */
+ size_t refCount; /* Number of uses of this structure. */
Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */
} Encoding;
@@ -185,6 +188,29 @@ static Tcl_Encoding systemEncoding = NULL;
Tcl_Encoding tclIdentityEncoding = NULL;
/*
+ * Names of encoding profiles and corresponding integer values.
+ * Keep alphabetical order for error messages.
+ */
+static struct TclEncodingProfiles {
+ const char *name;
+ int value;
+} encodingProfiles[] = {
+ {"replace", TCL_ENCODING_PROFILE_REPLACE},
+ {"strict", TCL_ENCODING_PROFILE_STRICT},
+ {"tcl8", TCL_ENCODING_PROFILE_TCL8},
+};
+#define PROFILE_STRICT(flags_) \
+ (CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT)
+
+#define PROFILE_REPLACE(flags_) \
+ (CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE)
+
+#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
+#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800)
+#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
+#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00)
+
+/*
* The following variable is used in the sparse matrix code for a
* TableEncoding to represent a page in the table that has no entries.
*/
@@ -214,51 +240,19 @@ static Tcl_Encoding LoadEscapeEncoding(const char *name,
static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp,
const char *name);
static Tcl_EncodingFreeProc TableFreeProc;
-static int TableFromUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int TableToUtfProc(ClientData clientData, const char *src,
- int srcLen, int flags, Tcl_EncodingState *statePtr,
- char *dst, int dstLen, int *srcReadPtr,
- int *dstWrotePtr, int *dstCharsPtr);
+static Tcl_EncodingConvertProc TableFromUtfProc;
+static Tcl_EncodingConvertProc TableToUtfProc;
static size_t unilen(const char *src);
-static int UnicodeToUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int UtfToUnicodeProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int UtfToUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr, int pureNullMode);
-static int UtfIntToUtfExtProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int UtfExtToUtfIntProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int Iso88591FromUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst, int dstLen,
- int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
-static int Iso88591ToUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst,
- int dstLen, int *srcReadPtr, int *dstWrotePtr,
- int *dstCharsPtr);
+static size_t unilen4(const char *src);
+static Tcl_EncodingConvertProc Utf32ToUtfProc;
+static Tcl_EncodingConvertProc UtfToUtf32Proc;
+static Tcl_EncodingConvertProc Utf16ToUtfProc;
+static Tcl_EncodingConvertProc UtfToUtf16Proc;
+static Tcl_EncodingConvertProc UtfToUcs2Proc;
+static Tcl_EncodingConvertProc UtfToUtfProc;
+static Tcl_EncodingConvertProc Iso88591FromUtfProc;
+static Tcl_EncodingConvertProc Iso88591ToUtfProc;
+
/*
* A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
@@ -267,8 +261,28 @@ static int Iso88591ToUtfProc(ClientData clientData,
*/
static const Tcl_ObjType encodingType = {
- "encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL
+ "encoding",
+ FreeEncodingInternalRep,
+ DupEncodingInternalRep,
+ NULL,
+ NULL
};
+
+#define EncodingSetInternalRep(objPtr, encoding) \
+ do { \
+ Tcl_ObjInternalRep ir; \
+ ir.twoPtrValue.ptr1 = (encoding); \
+ ir.twoPtrValue.ptr2 = NULL; \
+ Tcl_StoreInternalRep((objPtr), &encodingType, &ir); \
+ } while (0)
+
+#define EncodingGetInternalRep(objPtr, encoding) \
+ do { \
+ const Tcl_ObjInternalRep *irPtr; \
+ irPtr = TclFetchInternalRep ((objPtr), &encodingType); \
+ (encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL; \
+ } while (0)
+
/*
*----------------------------------------------------------------------
@@ -295,17 +309,16 @@ Tcl_GetEncodingFromObj(
Tcl_Obj *objPtr,
Tcl_Encoding *encodingPtr)
{
+ Tcl_Encoding encoding;
const char *name = TclGetString(objPtr);
- if (objPtr->typePtr != &encodingType) {
- Tcl_Encoding encoding = Tcl_GetEncoding(interp, name);
-
+ EncodingGetInternalRep(objPtr, encoding);
+ if (encoding == NULL) {
+ encoding = Tcl_GetEncoding(interp, name);
if (encoding == NULL) {
return TCL_ERROR;
}
- TclFreeIntRep(objPtr);
- objPtr->internalRep.twoPtrValue.ptr1 = encoding;
- objPtr->typePtr = &encodingType;
+ EncodingSetInternalRep(objPtr, encoding);
}
*encodingPtr = Tcl_GetEncoding(NULL, name);
return TCL_OK;
@@ -325,8 +338,10 @@ static void
FreeEncodingInternalRep(
Tcl_Obj *objPtr)
{
- Tcl_FreeEncoding((Tcl_Encoding)objPtr->internalRep.twoPtrValue.ptr1);
- objPtr->typePtr = NULL;
+ Tcl_Encoding encoding;
+
+ EncodingGetInternalRep(objPtr, encoding);
+ Tcl_FreeEncoding(encoding);
}
/*
@@ -344,7 +359,8 @@ DupEncodingInternalRep(
Tcl_Obj *srcPtr,
Tcl_Obj *dupPtr)
{
- dupPtr->internalRep.twoPtrValue.ptr1 = Tcl_GetEncoding(NULL, srcPtr->bytes);
+ Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr));
+ EncodingSetInternalRep(dupPtr, encoding);
}
/*
@@ -382,9 +398,9 @@ int
Tcl_SetEncodingSearchPath(
Tcl_Obj *searchPath)
{
- int dummy;
+ Tcl_Size dummy;
- if (TCL_ERROR == TclListObjLength(NULL, searchPath, &dummy)) {
+ if (TCL_ERROR == TclListObjLengthM(NULL, searchPath, &dummy)) {
return TCL_ERROR;
}
TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL);
@@ -429,9 +445,9 @@ void
TclSetLibraryPath(
Tcl_Obj *path)
{
- int dummy;
+ Tcl_Size dummy;
- if (TCL_ERROR == TclListObjLength(NULL, path, &dummy)) {
+ if (TCL_ERROR == TclListObjLengthM(NULL, path, &dummy)) {
return;
}
TclSetProcessGlobalValue(&libraryPath, path, NULL);
@@ -465,22 +481,22 @@ TclSetLibraryPath(
static void
FillEncodingFileMap(void)
{
- int i, numDirs = 0;
+ Tcl_Size i, numDirs = 0;
Tcl_Obj *map, *searchPath;
searchPath = Tcl_GetEncodingSearchPath();
Tcl_IncrRefCount(searchPath);
- TclListObjLength(NULL, searchPath, &numDirs);
+ TclListObjLengthM(NULL, searchPath, &numDirs);
map = Tcl_NewDictObj();
Tcl_IncrRefCount(map);
- for (i = numDirs-1; i >= 0; i--) {
+ for (i = numDirs-1; i != TCL_INDEX_NONE; i--) {
/*
* Iterate backwards through the search path so as we overwrite
* entries found, we favor files earlier on the search path.
*/
- int j, numFiles;
+ Tcl_Size j, numFiles;
Tcl_Obj *directory, *matchFileList;
Tcl_Obj **filev;
Tcl_GlobTypeData readableFiles = {
@@ -494,7 +510,7 @@ FillEncodingFileMap(void)
Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc",
&readableFiles);
- TclListObjGetElements(NULL, matchFileList, &numFiles, &filev);
+ TclListObjGetElementsM(NULL, matchFileList, &numFiles, &filev);
for (j=0; j<numFiles; j++) {
Tcl_Obj *encodingName, *fileObj;
@@ -529,6 +545,17 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
+/*
+ * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS
+ * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this
+ * when adding bits. TODO - should really be defined in a single file.
+ *
+ * To prevent conflicting bits, only define bits within 0xff00 mask here.
+ */
+#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */
+#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
+#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */
+
void
TclInitEncodingSubsystem(void)
{
@@ -540,12 +567,16 @@ TclInitEncodingSubsystem(void)
char c;
short s;
} isLe;
+ int leFlags;
if (encodingsInitialized) {
return;
}
+ /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */
isLe.s = 1;
+ leFlags = isLe.c ? TCL_ENCODING_LE : 0;
+
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -556,7 +587,7 @@ TclInitEncodingSubsystem(void)
* properly formed stream.
*/
- type.encodingName = "identity";
+ type.encodingName = NULL;
type.toUtfProc = BinaryProc;
type.fromUtfProc = BinaryProc;
type.freeProc = NULL;
@@ -565,21 +596,63 @@ TclInitEncodingSubsystem(void)
tclIdentityEncoding = Tcl_CreateEncoding(&type);
type.encodingName = "utf-8";
- type.toUtfProc = UtfExtToUtfIntProc;
- type.fromUtfProc = UtfIntToUtfExtProc;
+ type.toUtfProc = UtfToUtfProc;
+ type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = NULL;
+ type.clientData = INT2PTR(ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(0);
+ type.encodingName = "cesu-8";
Tcl_CreateEncoding(&type);
- type.encodingName = "unicode";
- type.toUtfProc = UnicodeToUtfProc;
- type.fromUtfProc = UtfToUnicodeProc;
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUcs2Proc;
type.freeProc = NULL;
type.nullSize = 2;
- type.clientData = INT2PTR(isLe.c);
+ type.encodingName = "ucs-2le";
+ type.clientData = INT2PTR(TCL_ENCODING_LE);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2";
+ type.clientData = INT2PTR(leFlags);
Tcl_CreateEncoding(&type);
+ type.toUtfProc = Utf32ToUtfProc;
+ type.fromUtfProc = UtfToUtf32Proc;
+ type.freeProc = NULL;
+ type.nullSize = 4;
+ type.encodingName = "utf-32le";
+ type.clientData = INT2PTR(TCL_ENCODING_LE);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-32be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-32";
+ type.clientData = INT2PTR(leFlags);
+ Tcl_CreateEncoding(&type);
+
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUtf16Proc;
+ type.freeProc = NULL;
+ type.nullSize = 2;
+ type.encodingName = "utf-16le";
+ type.clientData = INT2PTR(TCL_ENCODING_LE|ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16be";
+ type.clientData = INT2PTR(ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16";
+ type.clientData = INT2PTR(leFlags|ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+
+#ifndef TCL_NO_DEPRECATED
+ type.encodingName = "unicode";
+ Tcl_CreateEncoding(&type);
+#endif
+
/*
* Need the iso8859-1 encoding in order to process binary data, so force
* it to always be embedded. Note that this encoding *must* be a proper
@@ -686,13 +759,14 @@ TclFinalizeEncodingSubsystem(void)
*-------------------------------------------------------------------------
*/
+#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9
const char *
Tcl_GetDefaultEncodingDir(void)
{
int numDirs;
Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath();
- TclListObjLength(NULL, searchPath, &numDirs);
+ TclListObjLengthM(NULL, searchPath, &numDirs);
if (numDirs == 0) {
return NULL;
}
@@ -723,12 +797,13 @@ Tcl_SetDefaultEncodingDir(
const char *path)
{
Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath();
- Tcl_Obj *directory = Tcl_NewStringObj(path, -1);
+ Tcl_Obj *directory = Tcl_NewStringObj(path, TCL_INDEX_NONE);
searchPath = Tcl_DuplicateObj(searchPath);
Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory);
Tcl_SetEncodingSearchPath(searchPath);
}
+#endif
/*
*-------------------------------------------------------------------------
@@ -834,9 +909,6 @@ FreeEncoding(
if (encodingPtr == NULL) {
return;
}
- if (encodingPtr->refCount<=0) {
- Tcl_Panic("FreeEncoding: refcount problem !!!");
- }
if (encodingPtr->refCount-- <= 1) {
if (encodingPtr->freeProc != NULL) {
encodingPtr->freeProc(encodingPtr->clientData);
@@ -856,7 +928,7 @@ FreeEncoding(
*
* Tcl_GetEncodingName --
*
- * Given an encoding, return the name that was used to constuct the
+ * Given an encoding, return the name that was used to construct the
* encoding.
*
* Results:
@@ -921,7 +993,7 @@ Tcl_GetEncodingNames(
Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
Tcl_CreateHashEntry(&table,
- Tcl_NewStringObj(encodingPtr->name, -1), &dummy);
+ Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy);
}
Tcl_MutexUnlock(&encodingMutex);
@@ -951,6 +1023,33 @@ Tcl_GetEncodingNames(
}
/*
+ *-------------------------------------------------------------------------
+ *
+ * Tcl_GetEncodingNulLength --
+ *
+ * Given an encoding, return the number of nul bytes used for the
+ * string termination.
+ *
+ * Results:
+ * The number of nul bytes used for the string termination.
+ *
+ * Side effects:
+ * None.
+ *
+ *---------------------------------------------------------------------------
+ */
+Tcl_Size
+Tcl_GetEncodingNulLength(
+ Tcl_Encoding encoding)
+{
+ if (encoding == NULL) {
+ encoding = systemEncoding;
+ }
+
+ return ((Encoding *) encoding)->nullSize;
+}
+
+/*
*------------------------------------------------------------------------
*
* Tcl_SetSystemEncoding --
@@ -1035,9 +1134,26 @@ Tcl_CreateEncoding(
const Tcl_EncodingType *typePtr)
/* The encoding type. */
{
+ Encoding *encodingPtr = (Encoding *)ckalloc(sizeof(Encoding));
+ encodingPtr->name = NULL;
+ encodingPtr->toUtfProc = typePtr->toUtfProc;
+ encodingPtr->fromUtfProc = typePtr->fromUtfProc;
+ encodingPtr->freeProc = typePtr->freeProc;
+ encodingPtr->nullSize = typePtr->nullSize;
+ encodingPtr->clientData = typePtr->clientData;
+ if (typePtr->nullSize == 2) {
+ encodingPtr->lengthProc = (LengthProc *) unilen;
+ } else if (typePtr->nullSize == 4) {
+ encodingPtr->lengthProc = (LengthProc *) unilen4;
+ } else {
+ encodingPtr->lengthProc = (LengthProc *) strlen;
+ }
+ encodingPtr->refCount = 1;
+ encodingPtr->hPtr = NULL;
+
+ if (typePtr->encodingName) {
Tcl_HashEntry *hPtr;
int isNew;
- Encoding *encodingPtr;
char *name;
Tcl_MutexLock(&encodingMutex);
@@ -1048,30 +1164,17 @@ Tcl_CreateEncoding(
* reference goes away.
*/
- encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
- encodingPtr->hPtr = NULL;
+ Encoding *replaceMe = (Encoding *)Tcl_GetHashValue(hPtr);
+ replaceMe->hPtr = NULL;
}
name = (char *)ckalloc(strlen(typePtr->encodingName) + 1);
-
- encodingPtr = (Encoding *)ckalloc(sizeof(Encoding));
encodingPtr->name = strcpy(name, typePtr->encodingName);
- encodingPtr->toUtfProc = typePtr->toUtfProc;
- encodingPtr->fromUtfProc = typePtr->fromUtfProc;
- encodingPtr->freeProc = typePtr->freeProc;
- encodingPtr->nullSize = typePtr->nullSize;
- encodingPtr->clientData = typePtr->clientData;
- if (typePtr->nullSize == 1) {
- encodingPtr->lengthProc = (LengthProc *) strlen;
- } else {
- encodingPtr->lengthProc = (LengthProc *) unilen;
- }
- encodingPtr->refCount = 1;
encodingPtr->hPtr = hPtr;
Tcl_SetHashValue(hPtr, encodingPtr);
Tcl_MutexUnlock(&encodingMutex);
-
+ }
return (Tcl_Encoding) encodingPtr;
}
@@ -1101,24 +1204,102 @@ Tcl_ExternalToUtfDString(
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
- int srcLen, /* Source string length in bytes, or < 0 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* encoding-specific string length. */
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
+ Tcl_ExternalToUtfDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
+ return Tcl_DStringValue(dstPtr);
+}
+
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * Tcl_ExternalToUtfDStringEx --
+ *
+ * Convert a source buffer from the specified encoding into UTF-8.
+ * The parameter flags controls the behavior, if any of the bytes in
+ * the source buffer are invalid or cannot be represented in utf-8.
+ * Possible flags values:
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
+ * Any other flag bits will cause an error to be returned (for future
+ * compatibility)
+ *
+ * Results:
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
+ *
+ * Side effects:
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner.
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+Tcl_ExternalToUtfDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
+ Tcl_Encoding encoding, /* The encoding for the source string, or NULL
+ * for the default system encoding. */
+ const char *src, /* Source string in specified encoding. */
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
+ * encoding-specific string length. */
+ int flags, /* Conversion control flags. */
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
+ * converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
+{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
+ int result, srcRead, dstWrote, dstChars;
+ Tcl_Size dstLen, soFar;
+ const char *srcStart = src;
+ /* DO FIRST - Must always be initialized before returning */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
if (encoding == NULL) {
encoding = systemEncoding;
}
- encodingPtr = (Encoding *) encoding;
+ encodingPtr = (Encoding *)encoding;
if (src == NULL) {
srcLen = 0;
@@ -1126,20 +1307,48 @@ Tcl_ExternalToUtfDString(
srcLen = encodingPtr->lengthProc(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END;
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END;
+ if (encodingPtr->toUtfProc == UtfToUtfProc) {
+ flags |= ENCODING_INPUT;
+ }
while (1) {
- result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
- flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars);
+ result = encodingPtr->toUtfProc(encodingPtr->clientData, src,
+ srcLen, flags, &state, dst, dstLen,
+ &srcRead, &dstWrote, &dstChars);
soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
+ src += srcRead;
if (result != TCL_CONVERT_NOSPACE) {
+ Tcl_Size nBytesProcessed = (src - srcStart);
+
Tcl_DStringSetLength(dstPtr, soFar);
- return Tcl_DStringValue(dstPtr);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ char buf[TCL_INTEGER_SPACE];
+ sprintf(buf, "%u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf("unexpected byte sequence starting at index %"
+ "u: '\\x%02X'",
+ nBytesProcessed,
+ UCHAR(srcStart[nBytesProcessed])));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL);
+ }
+ }
+ return result;
}
+ /* Expand space and continue */
flags &= ~TCL_ENCODING_START;
- src += srcRead;
srcLen -= srcRead;
if (Tcl_DStringLength(dstPtr) == 0) {
Tcl_DStringSetLength(dstPtr, dstLen);
@@ -1170,11 +1379,11 @@ Tcl_ExternalToUtfDString(
int
Tcl_ExternalToUtf(
- Tcl_Interp *interp, /* Interp for error return, if not NULL. */
+ TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
- int srcLen, /* Source string length in bytes, or < 0 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* encoding-specific string length. */
int flags, /* Conversion control flags. */
Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
@@ -1184,7 +1393,7 @@ Tcl_ExternalToUtf(
* routine under control of flags argument. */
char *dst, /* Output buffer in which converted string is
* stored. */
- int dstLen, /* The maximum length of output buffer in
+ Tcl_Size dstLen, /* The maximum length of output buffer in
* bytes. */
int *srcReadPtr, /* Filled with the number of bytes from the
* source string that were converted. This may
@@ -1233,21 +1442,24 @@ Tcl_ExternalToUtf(
}
if (!noTerminate) {
- if (dstLen < 1) {
- return TCL_CONVERT_NOSPACE;
- }
+ if (dstLen < 1) {
+ return TCL_CONVERT_NOSPACE;
+ }
/*
* If there are any null characters in the middle of the buffer,
- * they will converted to the UTF-8 null character (\xC080). To get
+ * they will converted to the UTF-8 null character (\xC0\x80). To get
* the actual \0 at the end of the destination buffer, we need to
* append it manually. First make room for it...
*/
dstLen--;
} else {
- if (dstLen < 0) {
- return TCL_CONVERT_NOSPACE;
- }
+ if (dstLen < 0) {
+ return TCL_CONVERT_NOSPACE;
+ }
+ }
+ if (encodingPtr->toUtfProc == UtfToUtfProc) {
+ flags |= ENCODING_INPUT;
}
do {
Tcl_EncodingState savedState = *statePtr;
@@ -1258,7 +1470,7 @@ Tcl_ExternalToUtf(
if (*dstCharsPtr <= maxChars) {
break;
}
- dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
+ dstLen = TclUtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
*statePtr = savedState;
} while (1);
if (!noTerminate) {
@@ -1295,17 +1507,92 @@ Tcl_UtfToExternalDString(
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
- int srcLen, /* Source string length in bytes, or < 0 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* strlen(). */
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
+ Tcl_UtfToExternalDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
+ return Tcl_DStringValue(dstPtr);
+}
+
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * Tcl_UtfToExternalDStringEx --
+ *
+ * Convert a source buffer from UTF-8 to the specified encoding.
+ * The parameter flags controls the behavior, if any of the bytes in
+ * the source buffer are invalid or cannot be represented in the
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
+ *
+ * Results:
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
+ *
+ * Side effects:
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfToExternalDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
+ Tcl_Encoding encoding, /* The encoding for the converted string, or
+ * NULL for the default system encoding. */
+ const char *src, /* Source string in UTF-8. */
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
+ * strlen(). */
+ int flags, /* Conversion control flags. */
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
+ * converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
+{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
+ int result, srcRead, dstWrote, dstChars;
+ const char *srcStart = src;
+ Tcl_Size dstLen, soFar;
+ /* DO FIRST - must always be initialized on return */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
@@ -1319,23 +1606,49 @@ Tcl_UtfToExternalDString(
} else if (srcLen < 0) {
srcLen = strlen(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END;
+
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END;
while (1) {
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
- srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
- &dstChars);
+ srcLen, flags, &state, dst, dstLen,
+ &srcRead, &dstWrote, &dstChars);
soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
+ src += srcRead;
if (result != TCL_CONVERT_NOSPACE) {
- if (encodingPtr->nullSize == 2) {
- Tcl_DStringSetLength(dstPtr, soFar + 1);
+ Tcl_Size nBytesProcessed = (src - srcStart);
+ int i = soFar + encodingPtr->nullSize - 1;
+ while (i >= soFar) {
+ Tcl_DStringSetLength(dstPtr, i--);
}
- Tcl_DStringSetLength(dstPtr, soFar);
- return Tcl_DStringValue(dstPtr);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ int pos = Tcl_NumUtfChars(srcStart, nBytesProcessed);
+ int ucs4;
+ char buf[TCL_INTEGER_SPACE];
+ TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4);
+ sprintf(buf, "%u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "unexpected character at index %d: 'U+%06X'",
+ pos,
+ ucs4));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
+ buf, NULL);
+ }
+ }
+ return result;
}
flags &= ~TCL_ENCODING_START;
- src += srcRead;
srcLen -= srcRead;
if (Tcl_DStringLength(dstPtr) == 0) {
Tcl_DStringSetLength(dstPtr, dstLen);
@@ -1366,11 +1679,11 @@ Tcl_UtfToExternalDString(
int
Tcl_UtfToExternal(
- Tcl_Interp *interp, /* Interp for error return, if not NULL. */
+ TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
- int srcLen, /* Source string length in bytes, or < 0 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* strlen(). */
int flags, /* Conversion control flags. */
Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
@@ -1380,7 +1693,7 @@ Tcl_UtfToExternal(
* routine under control of flags argument. */
char *dst, /* Output buffer in which converted string
* is stored. */
- int dstLen, /* The maximum length of output buffer in
+ Tcl_Size dstLen, /* The maximum length of output buffer in
* bytes. */
int *srcReadPtr, /* Filled with the number of bytes from the
* source string that were converted. This may
@@ -1423,20 +1736,17 @@ Tcl_UtfToExternal(
}
if (dstLen < encodingPtr->nullSize) {
- return TCL_CONVERT_NOSPACE;
+ return TCL_CONVERT_NOSPACE;
}
dstLen -= encodingPtr->nullSize;
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
- flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
- dstCharsPtr);
+ flags, statePtr, dst, dstLen, srcReadPtr,
+ dstWrotePtr, dstCharsPtr);
/*
* Buffer is terminated irrespective of result. Not sure this is
* reasonable but keep for historical/compatibility reasons.
*/
- if (encodingPtr->nullSize == 2) {
- dst[*dstWrotePtr + 1] = '\0';
- }
- dst[*dstWrotePtr] = '\0';
+ memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize);
return result;
}
@@ -1459,14 +1769,15 @@ Tcl_UtfToExternal(
*---------------------------------------------------------------------------
*/
#undef Tcl_FindExecutable
-void
+const char *
Tcl_FindExecutable(
const char *argv0) /* The value of the application's argv[0]
* (native). */
{
- TclInitSubsystems();
+ const char *version = Tcl_InitSubsystems();
TclpSetInitialEncodings();
TclpFindExecutable(argv0);
+ return version;
}
/*
@@ -1494,17 +1805,17 @@ OpenEncodingFileChannel(
const char *name) /* The name of the encoding file on disk and
* also the name for new encoding. */
{
- Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1);
+ Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE);
Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj);
Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath());
Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap);
Tcl_Obj **dir, *path, *directory = NULL;
Tcl_Channel chan = NULL;
- int i, numDirs;
+ Tcl_Size i, numDirs;
- TclListObjGetElements(NULL, searchPath, &numDirs, &dir);
+ TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir);
Tcl_IncrRefCount(nameObj);
- Tcl_AppendToObj(fileNameObj, ".enc", -1);
+ Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE);
Tcl_IncrRefCount(fileNameObj);
Tcl_DictObjGet(NULL, map, nameObj, &directory);
@@ -1723,7 +2034,7 @@ LoadTableEncoding(
};
Tcl_DStringInit(&lineString);
- if (Tcl_Gets(chan, &lineString) == -1) {
+ if (Tcl_Gets(chan, &lineString) == TCL_IO_FAILURE) {
return NULL;
}
line = Tcl_DStringValue(&lineString);
@@ -1765,7 +2076,7 @@ LoadTableEncoding(
for (i = 0; i < numPages; i++) {
int ch;
const char *p;
- int expected = 3 + 16 * (16 * 4 + 1);
+ Tcl_Size expected = 3 + 16 * (16 * 4 + 1);
if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) {
return NULL;
@@ -1849,8 +2160,8 @@ LoadTableEncoding(
*/
if (dataPtr->fromUnicode[0] != NULL) {
- if (dataPtr->fromUnicode[0]['\\'] == '\0') {
- dataPtr->fromUnicode[0]['\\'] = '\\';
+ if (dataPtr->fromUnicode[0][(int)'\\'] == '\0') {
+ dataPtr->fromUnicode[0][(int)'\\'] = '\\';
}
}
}
@@ -2001,7 +2312,7 @@ LoadEscapeEncoding(
Tcl_DStringInit(&escapeData);
while (1) {
- int argc;
+ Tcl_Size argc;
const char **argv;
char *line;
Tcl_DString lineString;
@@ -2053,7 +2364,7 @@ LoadEscapeEncoding(
Tcl_DStringFree(&lineString);
}
- size = TclOffset(EscapeEncodingData, subTables)
+ size = offsetof(EscapeEncodingData, subTables)
+ Tcl_DStringLength(&escapeData);
dataPtr = (EscapeEncodingData *)ckalloc(size);
dataPtr->initLen = strlen(init);
@@ -2111,15 +2422,11 @@ LoadEscapeEncoding(
static int
BinaryProc(
- ClientData clientData, /* Not used. */
+ TCL_UNUSED(void *),
const char *src, /* Source string (unknown encoding). */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2140,6 +2447,7 @@ BinaryProc(
if (dstLen < 0) {
dstLen = 0;
}
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
srcLen = *dstCharsPtr;
}
@@ -2158,11 +2466,11 @@ BinaryProc(
/*
*-------------------------------------------------------------------------
*
- * UtfIntToUtfExtProc --
+ * UtfToUtfProc --
*
- * Convert from UTF-8 to UTF-8. While converting null-bytes from the
- * Tcl's internal representation (0xC0, 0x80) to the official
- * representation (0x00). See UtfToUtfProc for details.
+ * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
+ * is not a no-op, because it turns a stream of improperly formed
+ * UTF-8 into a properly-formed stream.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2174,18 +2482,14 @@ BinaryProc(
*/
static int
-UtfIntToUtfExtProc(
- ClientData clientData, /* Not used. */
+UtfToUtfProc(
+ void *clientData, /* additional flags */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
- char *dst, /* Output buffer in which converted string
- * is stored. */
+ int flags, /* TCL_ENCODING_* conversion control flags. */
+ TCL_UNUSED(Tcl_EncodingState *),
+ char *dst, /* Output buffer in which converted string is
+ * stored. */
int dstLen, /* The maximum length of output buffer in
* bytes. */
int *srcReadPtr, /* Filled with the number of bytes from the
@@ -2200,18 +2504,195 @@ UtfIntToUtfExtProc(
* correspond to the bytes stored in the
* output buffer. */
{
- return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
-}
+ const char *srcStart, *srcEnd, *srcClose;
+ const char *dstStart, *dstEnd;
+ int result, numChars, charLimit = INT_MAX;
+ int ch;
+ int profile;
+
+ result = TCL_OK;
+
+ srcStart = src;
+ srcEnd = src + srcLen;
+ srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
+ if ((flags & TCL_ENCODING_END) == 0) {
+ srcClose -= 6;
+ }
+ if (flags & TCL_ENCODING_CHAR_LIMIT) {
+ charLimit = *dstCharsPtr;
+ }
+
+ dstStart = dst;
+ flags |= PTR2INT(clientData);
+ dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
+
+ profile = CHANNEL_PROFILE_GET(flags);
+ for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+
+ if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ /*
+ * If there is more string to follow, this will ensure that the
+ * last UTF-8 character in the source buffer hasn't been cut off.
+ */
+
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ break;
+ }
+ if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
+ /*
+ * Copy 7bit characters, but skip null-bytes when we are in input
+ * mode, so that they get converted to \xC0\x80.
+ */
+ *dst++ = *src++;
+ } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
+ (UCHAR(src[1]) == 0x80) &&
+ (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||
+ PROFILE_REPLACE(profile))) {
+ /* Special sequence \xC0\x80 */
+ if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {
+ if (PROFILE_REPLACE(profile)) {
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ src += 2;
+ } else {
+ /* PROFILE_STRICT */
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ } else {
+ /*
+ * Convert 0xC080 to real nulls when we are in output mode,
+ * irrespective of the profile.
+ */
+ *dst++ = 0;
+ src += 2;
+ }
+
+ } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
+ /*
+ * Incomplete byte sequence.
+ * Always check before using TclUtfToUCS4. Not doing can so
+ * cause it run beyond the end of the buffer! If we happen such an
+ * incomplete char its bytes are made to represent themselves
+ * unless the user has explicitly asked to be told.
+ */
+
+ if (flags & ENCODING_INPUT) {
+ /* Incomplete bytes for modified UTF-8 target */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & TCL_ENCODING_CHAR_LIMIT)
+ ? TCL_CONVERT_MULTIBYTE
+ : TCL_CONVERT_SYNTAX;
+ break;
+ }
+ }
+ if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ ++src;
+ } else {
+ /* TCL_ENCODING_PROFILE_TCL8 */
+ char chbuf[2];
+ chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
+ TclUtfToUCS4(chbuf, &ch);
+ }
+ dst += Tcl_UniCharToUtf(ch, dst);
+ } else {
+ int low;
+ int isInvalid = 0;
+ size_t len = TclUtfToUCS4(src, &ch);
+ if (flags & ENCODING_INPUT) {
+ if ((len < 2) && (ch != 0)) {
+ isInvalid = 1;
+ } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
+ isInvalid = 1;
+ }
+ if (isInvalid) {
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
+ }
+ }
+ const char *saveSrc = src;
+ src += len;
+ if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
+ if (ch > 0xFFFF) {
+ /* CESU-8 6-byte sequence for chars > U+FFFF */
+ ch -= 0x10000;
+ *dst++ = 0xED;
+ *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
+ *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
+ ch = (ch & 0x0CFF) | 0xDC00;
+ }
+ goto cesu8;
+ } else if ((ch | 0x7FF) == 0xDFFF) {
+ /*
+ * A surrogate character is detected, handle especially.
+ */
+ if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ low = ch;
+ len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0;
+
+ if ((!LOW_SURROGATE(low)) || (ch & 0x400)) {
+
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+cesu8:
+ *dst++ = (char)(((ch >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char)(((ch >> 6) | 0x80) & 0xBF);
+ *dst++ = (char)((ch | 0x80) & 0xBF);
+ continue;
+ }
+ src += len;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = low;
+ }
+ } else if (PROFILE_STRICT(profile) &&
+ (!(flags & ENCODING_INPUT)) &&
+ SURROGATE(ch)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ } else if (PROFILE_STRICT(profile) &&
+ (flags & ENCODING_INPUT) &&
+ SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+ src = saveSrc;
+ break;
+ }
+ dst += Tcl_UniCharToUtf(ch, dst);
+ }
+ }
+
+ *srcReadPtr = src - srcStart;
+ *dstWrotePtr = dst - dstStart;
+ *dstCharsPtr = numChars;
+ return result;
+}
+
/*
*-------------------------------------------------------------------------
*
- * UtfExtToUtfIntProc --
+ * Utf32ToUtfProc --
*
- * Convert from UTF-8 to UTF-8 while converting null-bytes from the
- * official representation (0x00) to Tcl's internal representation (0xC0,
- * 0x80). See UtfToUtfProc for details.
+ * Convert from UTF-32 to UTF-8.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2223,16 +2704,12 @@ UtfIntToUtfExtProc(
*/
static int
-UtfExtToUtfIntProc(
- ClientData clientData, /* Not used. */
- const char *src, /* Source string in UTF-8. */
+Utf32ToUtfProc(
+ void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */
+ const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2249,18 +2726,128 @@ UtfExtToUtfIntProc(
* correspond to the bytes stored in the
* output buffer. */
{
- return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
-}
+ const char *srcStart, *srcEnd;
+ const char *dstEnd, *dstStart;
+ int result, numChars, charLimit = INT_MAX;
+ int ch = 0, bytesLeft = srcLen % 4;
+
+ flags = TclEncodingSetProfileFlags(flags);
+ flags |= PTR2INT(clientData);
+ if (flags & TCL_ENCODING_CHAR_LIMIT) {
+ charLimit = *dstCharsPtr;
+ }
+ result = TCL_OK;
+
+ /*
+ * Check alignment with utf-32 (4 == sizeof(UTF-32))
+ */
+ if (bytesLeft != 0) {
+ /* We have a truncated code unit */
+ result = TCL_CONVERT_MULTIBYTE;
+ srcLen -= bytesLeft;
+ }
+
+ /*
+ * If last code point is a high surrogate, we cannot handle that yet,
+ * unless we are at the end.
+ */
+
+ if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
+ result = TCL_CONVERT_MULTIBYTE;
+ srcLen-= 4;
+ }
+
+ srcStart = src;
+ srcEnd = src + srcLen;
+
+ dstStart = dst;
+ dstEnd = dst + dstLen - TCL_UTF_MAX;
+
+ for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ break;
+ }
+
+ int prev = ch;
+ if (flags & TCL_ENCODING_LE) {
+ ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
+ } else {
+ ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
+ }
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+
+ if ((unsigned)ch > 0x10FFFF) {
+ ch = UNICODE_REPLACE_CHAR;
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+ ch = 0;
+ break;
+ } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
+
+ /*
+ * Special case for 1-byte utf chars for speed. Make sure we work with
+ * unsigned short-size data.
+ */
+
+ if ((unsigned)ch - 1 < 0x7F) {
+ *dst++ = (ch & 0xFF);
+ } else {
+ if (!HIGH_SURROGATE(prev) && LOW_SURROGATE(ch)) {
+ *dst = 0; /* In case of lower surrogate, don't try to combine */
+ }
+ dst += Tcl_UniCharToUtf(ch, dst);
+ }
+ src += 4;
+ }
+
+ if (HIGH_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+
+ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
+ /* We have a code fragment left-over at the end */
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ } else {
+ /* destination is not full, so we really are at the end now */
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ } else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
+ result = TCL_OK;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ numChars++;
+ src += bytesLeft; /* Go past truncated code unit */
+ }
+ }
+ }
+ *srcReadPtr = src - srcStart;
+ *dstWrotePtr = dst - dstStart;
+ *dstCharsPtr = numChars;
+ return result;
+}
+
/*
*-------------------------------------------------------------------------
*
- * UtfToUtfProc --
+ * UtfToUtf32Proc --
*
- * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
- * is not a no-op, because it turns a stream of improperly formed
- * UTF-8 into a properly-formed stream.
+ * Convert from UTF-8 to UTF-32.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2272,16 +2859,12 @@ UtfExtToUtfIntProc(
*/
static int
-UtfToUtfProc(
- ClientData clientData, /* Not used. */
+UtfToUtf32Proc(
+ void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2294,37 +2877,28 @@ UtfToUtfProc(
int *dstWrotePtr, /* Filled with the number of bytes that were
* stored in the output buffer as a result of
* the conversion. */
- int *dstCharsPtr, /* Filled with the number of characters that
+ int *dstCharsPtr) /* Filled with the number of characters that
* correspond to the bytes stored in the
* output buffer. */
- int pureNullMode) /* Convert embedded nulls from internal
- * representation to real null-bytes or vice
- * versa. Also combine or separate surrogate pairs */
{
- const char *srcStart, *srcEnd, *srcClose;
- const char *dstStart, *dstEnd;
- int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
-
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
- result = TCL_OK;
+ const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+ int result, numChars;
+ int ch, len;
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
- srcClose -= 6;
- }
- if (flags & TCL_ENCODING_CHAR_LIMIT) {
- charLimit = *dstCharsPtr;
+ srcClose -= TCL_UTF_MAX;
}
dstStart = dst;
- dstEnd = dst + dstLen - ((pureNullMode == 1) ? 4 : TCL_UTF_MAX);
+ dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+ flags |= PTR2INT(clientData);
- for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+ result = TCL_OK;
+ for (numChars = 0; src < srcEnd; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
@@ -2338,77 +2912,27 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (pureNullMode == 0))) {
- /*
- * Copy 7bit characters, but skip null-bytes when we are in input
- * mode, so that they get converted to 0xC080.
- */
-
- *dst++ = *src++;
- *chPtr = 0; /* reset surrogate handling */
- } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
- && (UCHAR(src[1]) == 0x80) && (pureNullMode == 1)) {
- /*
- * Convert 0xC080 to real nulls when we are in output mode.
- */
-
- *dst++ = 0;
- *chPtr = 0; /* reset surrogate handling */
- src += 2;
- } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
- /*
- * Always check before using TclUtfToUniChar. Not doing can so
- * cause it run beyond the end of the buffer! If we happen such an
- * incomplete char its bytes are made to represent themselves
- * unless the user has explicitly asked to be told.
- */
-
- if ((flags & TCL_ENCODING_STOPONERROR) && (pureNullMode == 0)) {
- result = TCL_CONVERT_MULTIBYTE;
+ len = TclUtfToUCS4(src, &ch);
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
break;
}
- *chPtr = UCHAR(*src);
- src += 1;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
- } else {
- size_t len = TclUtfToUniChar(src, chPtr);
- if ((len < 2) && (*chPtr != 0) && (flags & TCL_ENCODING_STOPONERROR)
- && ((*chPtr & ~0x7FF) != 0xD800) && (pureNullMode == 0)) {
- result = TCL_CONVERT_SYNTAX;
- break;
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
}
- src += len;
- if ((*chPtr & ~0x7FF) == 0xD800) {
- Tcl_UniChar low;
- /* A surrogate character is detected, handle especially */
-#if TCL_UTF_MAX <= 4
- if ((len < 3) && ((src[3 - len] & 0xC0) != 0x80)) {
- /* It's invalid. See [ed29806ba] */
- *chPtr = UCHAR(src[-1]);
- dst += Tcl_UniCharToUtf(*chPtr, dst);
- continue;
- }
-#endif
- low = *chPtr;
- len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0;
- if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
- *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
- *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
- *chPtr = 0; /* reset surrogate handling */
- continue;
- } else if ((TCL_UTF_MAX > 3) || (pureNullMode == 1)) {
- int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
- *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7);
- *dst++ = (char) (((full >> 12) | 0x80) & 0xBF);
- *dst++ = (char) (((full >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((full | 0x80) & 0xBF);
- *chPtr = 0; /* reset surrogate handling */
- src += len;
- continue;
- }
- }
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ }
+ src += len;
+ if (flags & TCL_ENCODING_LE) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = ((ch >> 8) & 0xFF);
+ *dst++ = ((ch >> 16) & 0xFF);
+ *dst++ = ((ch >> 24) & 0xFF);
+ } else {
+ *dst++ = ((ch >> 24) & 0xFF);
+ *dst++ = ((ch >> 16) & 0xFF);
+ *dst++ = ((ch >> 8) & 0xFF);
+ *dst++ = (ch & 0xFF);
}
}
@@ -2421,7 +2945,7 @@ UtfToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UnicodeToUtfProc --
+ * Utf16ToUtfProc --
*
* Convert from UTF-16 to UTF-8.
*
@@ -2435,16 +2959,12 @@ UtfToUtfProc(
*/
static int
-UnicodeToUtfProc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+Utf16ToUtfProc(
+ void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2464,8 +2984,10 @@ UnicodeToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- unsigned short ch;
+ unsigned short ch = 0;
+ flags = TclEncodingSetProfileFlags(flags);
+ flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -2480,18 +3002,16 @@ UnicodeToUtfProc(
srcLen--;
}
-#if TCL_UTF_MAX > 3
/*
* If last code point is a high surrogate, we cannot handle that yet,
* unless we are at the end.
*/
if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
- ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 2;
}
-#endif
srcStart = src;
srcEnd = src + srcLen;
@@ -2505,37 +3025,76 @@ UnicodeToUtfProc(
break;
}
- if (clientData) {
+ unsigned short prev = ch;
+ if (flags & TCL_ENCODING_LE) {
ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2; /* Go back to beginning of high surrogate */
+ dst--; /* Also undo writing a single byte too much */
+ numChars--;
+ break;
+ }
+ /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
/*
* Special case for 1-byte utf chars for speed. Make sure we work with
* unsigned short-size data.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
+ } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
+ dst += Tcl_UniCharToUtf(ch, dst);
+ } else if (LOW_SURROGATE(ch) && PROFILE_STRICT(flags)) {
+ /* Lo surrogate not preceded by Hi surrogate */
+ result = TCL_CONVERT_SYNTAX;
+ break;
} else {
+ *dst = 0; /* In case of lower surrogate, don't try to combine */
dst += Tcl_UniCharToUtf(ch, dst);
}
src += sizeof(unsigned short);
}
+ if (HIGH_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2;
+ dst--;
+ numChars--;
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+ }
+
+ /*
+ * If we had a truncated code unit at the end AND this is the last
+ * fragment AND profile is not "strict", stick FFFD in its place.
+ */
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
- /* We have a single byte left-over at the end */
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
} else {
- /* destination is not full, so we really are at the end now */
- result = TCL_OK;
- dst += Tcl_UniCharToUtf(0xFFFD, dst);
- numChars++;
- src++;
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ } else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
+ result = TCL_OK;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ numChars++;
+ src++; /* Go past truncated code unit */
+ }
}
}
+
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
@@ -2545,7 +3104,7 @@ UnicodeToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UtfToUnicodeProc --
+ * UtfToUtf16Proc --
*
* Convert from UTF-8 to UTF-16.
*
@@ -2559,16 +3118,12 @@ UnicodeToUtfProc(
*/
static int
-UtfToUnicodeProc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+UtfToUtf16Proc(
+ void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2587,20 +3142,19 @@ UtfToUnicodeProc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ int ch, len;
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
dstStart = dst;
dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+ flags |= PTR2INT(clientData);
result = TCL_OK;
for (numChars = 0; src < srcEnd; numChars++) {
@@ -2617,38 +3171,154 @@ UtfToUnicodeProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUniChar(src, chPtr);
-
- if (clientData) {
-#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
+ len = TclUtfToUCS4(src, &ch);
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
+ }
+ src += len;
+ if (flags & TCL_ENCODING_LE) {
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (*chPtr & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (ch & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
}
-#else
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
-#endif
} else {
-#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
- *dst++ = (*chPtr & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
+ *dst++ = (ch & 0xFF);
+ }
+ }
+ }
+ *srcReadPtr = src - srcStart;
+ *dstWrotePtr = dst - dstStart;
+ *dstCharsPtr = numChars;
+ return result;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfToUcs2Proc --
+ *
+ * Convert from UTF-8 to UCS-2.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+UtfToUcs2Proc(
+ void *clientData, /* additional flags, e.g. TCL_ENCODING_LE */
+ const char *src, /* Source string in UTF-8. */
+ int srcLen, /* Source string length in bytes. */
+ int flags, /* Conversion control flags. */
+ TCL_UNUSED(Tcl_EncodingState *),
+ char *dst, /* Output buffer in which converted string is
+ * stored. */
+ int dstLen, /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr, /* Filled with the number of bytes from the
+ * source string that were converted. This may
+ * be less than the original source length if
+ * there was a problem converting some source
+ * characters. */
+ int *dstWrotePtr, /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr) /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+ int result, numChars, len;
+ Tcl_UniChar ch = 0;
+
+ flags = TclEncodingSetProfileFlags(flags);
+ flags |= PTR2INT(clientData);
+ srcStart = src;
+ srcEnd = src + srcLen;
+ srcClose = srcEnd;
+ if ((flags & TCL_ENCODING_END) == 0) {
+ srcClose -= TCL_UTF_MAX;
+ }
+
+ dstStart = dst;
+ dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+
+ result = TCL_OK;
+ for (numChars = 0; src < srcEnd; numChars++) {
+ if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ /*
+ * If there is more string to follow, this will ensure that the
+ * last UTF-8 character in the source buffer hasn't been cut off.
+ */
+
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ break;
+ }
+#if TCL_UTF_MAX < 4
+ len = TclUtfToUniChar(src, &ch);
+ if ((ch >= 0xD800) && (len < 3)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
}
+ src += len;
+ src += TclUtfToUniChar(src, &ch);
+ ch = UNICODE_REPLACE_CHAR;
+ }
#else
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
+ len = TclUtfToUniChar(src, &ch);
+ if (ch > 0xFFFF) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ ch = UNICODE_REPLACE_CHAR;
+ }
#endif
+ if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+
+ src += len;
+
+ /*
+ * Need to handle this in a way that won't cause misalignment by
+ * casting dst to a Tcl_UniChar. [Bug 1122671]
+ */
+
+ if (flags & TCL_ENCODING_LE) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
+ } else {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
}
}
*srcReadPtr = src - srcStart;
@@ -2676,16 +3346,12 @@ UtfToUnicodeProc(
static int
TableToUtfProc(
- ClientData clientData, /* TableEncodingData that specifies
+ void *clientData, /* TableEncodingData that specifies
* encoding. */
const char *src, /* Source string in specified encoding. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2710,6 +3376,7 @@ TableToUtfProc(
const unsigned short *pageZero;
TableEncodingData *dataPtr = (TableEncodingData *)clientData;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -2733,30 +3400,47 @@ TableToUtfProc(
if (prefixBytes[byte]) {
src++;
if (src >= srcEnd) {
- src--;
- result = TCL_CONVERT_MULTIBYTE;
- break;
+ if (!(flags & TCL_ENCODING_END)) {
+ src--;
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ } else if (PROFILE_STRICT(flags)) {
+ src--;
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ src--; /* See bug [bdcb5126c0] */
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ } else {
+ ch = toUnicode[byte][*((unsigned char *)src)];
}
- ch = toUnicode[byte][*((unsigned char *) src)];
} else {
ch = pageZero[byte];
}
if ((ch == 0) && (byte != 0)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
break;
}
if (prefixBytes[byte]) {
src--;
}
- ch = (Tcl_UniChar) byte;
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ ch = (Tcl_UniChar)byte;
+ }
}
/*
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -2789,16 +3473,12 @@ TableToUtfProc(
static int
TableFromUtfProc(
- ClientData clientData, /* TableEncodingData that specifies
+ void *clientData, /* TableEncodingData that specifies
* encoding. */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2830,6 +3510,7 @@ TableFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -2849,12 +3530,12 @@ TableFromUtfProc(
}
len = TclUtfToUniChar(src, &ch);
-#if TCL_UTF_MAX > 4
+#if TCL_UTF_MAX > 3
/* Unicode chars > +U0FFFF cannot be represented in any table encoding */
if (ch & 0xFFFF0000) {
word = 0;
} else
-#elif TCL_UTF_MAX == 4
+#else
if (!len) {
word = 0;
} else
@@ -2862,11 +3543,11 @@ TableFromUtfProc(
word = fromUnicode[(ch >> 8)][ch & 0xFF];
if ((word == 0) && (ch != 0)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
- word = dataPtr->fallback;
+ word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
}
if (prefixBytes[(word >> 8)] != 0) {
if (dst + 1 > dstEnd) {
@@ -2911,15 +3592,11 @@ TableFromUtfProc(
static int
Iso88591ToUtfProc(
- ClientData clientData, /* Ignored. */
+ TCL_UNUSED(void *),
const char *src, /* Source string in specified encoding. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2940,6 +3617,7 @@ Iso88591ToUtfProc(
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -2963,7 +3641,7 @@ Iso88591ToUtfProc(
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -2995,15 +3673,11 @@ Iso88591ToUtfProc(
static int
Iso88591FromUtfProc(
- ClientData clientData, /* Ignored. */
+ TCL_UNUSED(void *),
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -3028,6 +3702,7 @@ Iso88591FromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3054,15 +3729,15 @@ Iso88591FromUtfProc(
*/
if (ch > 0xFF
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX < 4
|| ((ch >= 0xD800) && (len < 3))
#endif
) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX < 4
if ((ch >= 0xD800) && (len < 3)) {
len = 4;
}
@@ -3071,7 +3746,7 @@ Iso88591FromUtfProc(
* Plunge on, using '?' as a fallback character.
*/
- ch = (Tcl_UniChar) '?';
+ ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */
}
if (dst > dstEnd) {
@@ -3107,10 +3782,10 @@ Iso88591FromUtfProc(
static void
TableFreeProc(
- ClientData clientData) /* TableEncodingData that specifies
+ void *clientData) /* TableEncodingData that specifies
* encoding. */
{
- TableEncodingData *dataPtr = (TableEncodingData *) clientData;
+ TableEncodingData *dataPtr = (TableEncodingData *)clientData;
/*
* Make sure we aren't freeing twice on shutdown. [Bug 219314]
@@ -3142,7 +3817,7 @@ TableFreeProc(
static int
EscapeToUtfProc(
- ClientData clientData, /* EscapeEncodingData that specifies
+ void *clientData, /* EscapeEncodingData that specifies
* encoding. */
const char *src, /* Source string in specified encoding. */
int srcLen, /* Source string length in bytes. */
@@ -3168,13 +3843,14 @@ EscapeToUtfProc(
* correspond to the bytes stored in the
* output buffer. */
{
- EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
+ EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
const unsigned short *const *tableToUnicode;
const Encoding *encodingPtr;
int state, result, numChars, charLimit = INT_MAX;
const char *dstStart, *dstEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3285,9 +3961,10 @@ EscapeToUtfProc(
if ((checked == dataPtr->numSubTables + 2)
|| (flags & TCL_ENCODING_END)) {
- if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
+ if (!PROFILE_STRICT(flags)) {
/*
- * Skip the unknown escape sequence.
+ * Skip the unknown escape sequence. TODO - bug?
+ * May be replace with UNICODE_REPLACE_CHAR?
*/
src += longest;
@@ -3356,7 +4033,7 @@ EscapeToUtfProc(
static int
EscapeFromUtfProc(
- ClientData clientData, /* EscapeEncodingData that specifies
+ void *clientData, /* EscapeEncodingData that specifies
* encoding. */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
@@ -3397,6 +4074,7 @@ EscapeFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3460,7 +4138,7 @@ EscapeFromUtfProc(
if (word == 0) {
state = oldState;
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3493,8 +4171,7 @@ EscapeFromUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- memcpy(dst, subTablePtr->sequence,
- subTablePtr->sequenceLen);
+ memcpy(dst, subTablePtr->sequence, subTablePtr->sequenceLen);
dst += subTablePtr->sequenceLen;
}
}
@@ -3568,7 +4245,7 @@ EscapeFromUtfProc(
static void
EscapeFreeProc(
- ClientData clientData) /* EscapeEncodingData that specifies
+ void *clientData) /* EscapeEncodingData that specifies
* encoding. */
{
EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
@@ -3645,7 +4322,7 @@ GetTableEncoding(
/*
*---------------------------------------------------------------------------
*
- * unilen --
+ * unilen, unilen4 --
*
* A helper function for the Tcl_ExternalToUtf functions. This function
* is similar to strlen for double-byte characters: it returns the number
@@ -3672,6 +4349,19 @@ unilen(
}
return (char *) p - src;
}
+
+static size_t
+unilen4(
+ const char *src)
+{
+ unsigned int *p;
+
+ p = (unsigned int *) src;
+ while (*p != 0x00000000) {
+ p++;
+ }
+ return (char *) p - src;
+}
/*
*-------------------------------------------------------------------------
@@ -3699,11 +4389,11 @@ unilen(
static void
InitializeEncodingSearchPath(
char **valuePtr,
- int *lengthPtr,
+ TCL_HASH_TYPE *lengthPtr,
Tcl_Encoding *encodingPtr)
{
const char *bytes;
- int i, numDirs, numBytes;
+ Tcl_Size i, numDirs, numBytes;
Tcl_Obj *libPathObj, *encodingObj, *searchPathObj;
TclNewLiteralStringObj(encodingObj, "encoding");
@@ -3712,7 +4402,7 @@ InitializeEncodingSearchPath(
Tcl_IncrRefCount(searchPathObj);
libPathObj = TclGetLibraryPath();
Tcl_IncrRefCount(libPathObj);
- TclListObjLength(NULL, libPathObj, &numDirs);
+ TclListObjLengthM(NULL, libPathObj, &numDirs);
for (i = 0; i < numDirs; i++) {
Tcl_Obj *directoryObj, *pathObj;
@@ -3733,7 +4423,7 @@ InitializeEncodingSearchPath(
if (*encodingPtr) {
((Encoding *)(*encodingPtr))->refCount++;
}
- bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes);
+ bytes = TclGetStringFromObj(searchPathObj, &numBytes);
*lengthPtr = numBytes;
*valuePtr = (char *)ckalloc(numBytes + 1);
@@ -3742,6 +4432,166 @@ InitializeEncodingSearchPath(
}
/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileParseName --
+ *
+ * Maps an encoding profile name to its integer equivalent.
+ *
+ * Results:
+ * TCL_OK on success or TCL_ERROR on failure.
+ *
+ * Side effects:
+ * Returns the profile enum value in *profilePtr
+ *
+ *------------------------------------------------------------------------
+ */
+int
+TclEncodingProfileNameToId(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ const char *profileName, /* Name of profile */
+ int *profilePtr) /* Output */
+{
+ size_t i;
+ size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+
+ for (i = 0; i < numProfiles; ++i) {
+ if (!strcmp(profileName, encodingProfiles[i].name)) {
+ *profilePtr = encodingProfiles[i].value;
+ return TCL_OK;
+ }
+ }
+ if (interp) {
+ Tcl_Obj *errorObj;
+ /* This code assumes at least two profiles :-) */
+ errorObj =
+ Tcl_ObjPrintf("bad profile name \"%s\": must be",
+ profileName);
+ for (i = 0; i < (numProfiles - 1); ++i) {
+ Tcl_AppendStringsToObj(
+ errorObj, " ", encodingProfiles[i].name, ",", NULL);
+ }
+ Tcl_AppendStringsToObj(
+ errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL);
+
+ Tcl_SetObjResult(interp, errorObj);
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILE", profileName, NULL);
+ }
+ return TCL_ERROR;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileValueToName --
+ *
+ * Maps an encoding profile value to its name.
+ *
+ * Results:
+ * Pointer to the name or NULL on failure. Caller must not make
+ * not modify the string and must make a copy to hold on to it.
+ *
+ * Side effects:
+ * None.
+ *------------------------------------------------------------------------
+ */
+const char *
+TclEncodingProfileIdToName(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ int profileValue) /* Profile #define value */
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) {
+ if (profileValue == encodingProfiles[i].value) {
+ return encodingProfiles[i].name;
+ }
+ }
+ if (interp) {
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "Internal error. Bad profile id \"%d\".",
+ profileValue));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILEID", NULL);
+ }
+ return NULL;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingSetProfileFlags --
+ *
+ * Maps the flags supported in the encoding C API's to internal flags.
+ *
+ * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is
+ * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile
+ * specified.
+ *
+ * If no profile or an invalid profile is specified, it is set to
+ * the default.
+ *
+ * Results:
+ * Internal encoding flag mask.
+ *
+ * Side effects:
+ * None.
+ *
+ *------------------------------------------------------------------------
+ */
+int TclEncodingSetProfileFlags(int flags)
+{
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT);
+ } else {
+ int profile = CHANNEL_PROFILE_GET(flags);
+ switch (profile) {
+ case TCL_ENCODING_PROFILE_TCL8:
+ case TCL_ENCODING_PROFILE_STRICT:
+ case TCL_ENCODING_PROFILE_REPLACE:
+ break;
+ case 0: /* Unspecified by caller */
+ default:
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_TCL8);
+ break;
+ }
+ }
+ return flags;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclGetEncodingProfiles --
+ *
+ * Get the list of supported encoding profiles.
+ *
+ * Results:
+ * None.
+ *
+ * Side effects:
+ * The list of profile names is stored in the interpreter result.
+ *
+ *------------------------------------------------------------------------
+ */
+void
+TclGetEncodingProfiles(Tcl_Interp *interp)
+{
+ size_t i, n;
+ Tcl_Obj *objPtr;
+ n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+ objPtr = Tcl_NewListObj(n, NULL);
+ for (i = 0; i < n; ++i) {
+ Tcl_ListObjAppendElement(
+ interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
+ }
+ Tcl_SetObjResult(interp, objPtr);
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4