summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c791
1 files changed, 631 insertions, 160 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 61e3236..2a96383 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -188,6 +188,36 @@ static Tcl_Encoding systemEncoding = NULL;
Tcl_Encoding tclIdentityEncoding = NULL;
/*
+ * Names of encoding profiles and corresponding integer values.
+ * Keep alphabetical order for error messages.
+ */
+static struct TclEncodingProfiles {
+ const char *name;
+ int value;
+} encodingProfiles[] = {
+ {"replace", TCL_ENCODING_PROFILE_REPLACE},
+ {"strict", TCL_ENCODING_PROFILE_STRICT},
+ {"tcl8", TCL_ENCODING_PROFILE_TCL8},
+};
+#define PROFILE_TCL8(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8))
+#define PROFILE_STRICT(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT))
+#define PROFILE_REPLACE(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE))
+
+#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
+#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800)
+#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
+#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00)
+
+/*
* The following variable is used in the sparse matrix code for a
* TableEncoding to represent a page in the table that has no entries.
*/
@@ -230,6 +260,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc;
static Tcl_EncodingConvertProc Iso88591FromUtfProc;
static Tcl_EncodingConvertProc Iso88591ToUtfProc;
+
/*
* A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
* of the internalrep. This should help the lifetime of encodings be more useful.
@@ -522,11 +553,16 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
-/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
- * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */
-#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */
+/*
+ * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS
+ * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this
+ * when adding bits. TODO - should really be defined in a single file.
+ *
+ * To prevent conflicting bits, only define bits within 0xff00 mask here.
+ */
+#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */
#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
-#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */
+#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */
void
TclInitEncodingSubsystem(void)
@@ -539,12 +575,16 @@ TclInitEncodingSubsystem(void)
char c;
short s;
} isLe;
+ int leFlags;
if (encodingsInitialized) {
return;
}
- isLe.s = TCL_ENCODING_LE;
+ /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */
+ isLe.s = 1;
+ leFlags = isLe.c ? TCL_ENCODING_LE : 0;
+
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -585,7 +625,7 @@ TclInitEncodingSubsystem(void)
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
type.encodingName = "ucs-2";
- type.clientData = INT2PTR(isLe.c);
+ type.clientData = INT2PTR(leFlags);
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf32ToUtfProc;
@@ -599,7 +639,7 @@ TclInitEncodingSubsystem(void)
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-32";
- type.clientData = INT2PTR(isLe.c);
+ type.clientData = INT2PTR(leFlags);
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
@@ -613,7 +653,7 @@ TclInitEncodingSubsystem(void)
type.clientData = INT2PTR(ENCODING_UTF);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-16";
- type.clientData = INT2PTR(isLe.c|ENCODING_UTF);
+ type.clientData = INT2PTR(leFlags|ENCODING_UTF);
Tcl_CreateEncoding(&type);
#ifndef TCL_NO_DEPRECATED
@@ -897,7 +937,7 @@ Tcl_GetEncodingNames(
Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
Tcl_CreateHashEntry(&table,
- Tcl_NewStringObj(encodingPtr->name, -1), &dummy);
+ Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy);
}
Tcl_MutexUnlock(&encodingMutex);
@@ -1114,7 +1154,8 @@ Tcl_ExternalToUtfDString(
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
- Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr);
+ Tcl_ExternalToUtfDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
return Tcl_DStringValue(dstPtr);
}
@@ -1128,34 +1169,53 @@ Tcl_ExternalToUtfDString(
* The parameter flags controls the behavior, if any of the bytes in
* the source buffer are invalid or cannot be represented in utf-8.
* Possible flags values:
- * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
- * fallback character. Always return -1 (Default in Tcl 8.7).
- * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
- * Only valid for "utf-8" and "cesu-8". This flag may be used together
- * with the other flags.
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
+ * Any other flag bits will cause an error to be returned (for future
+ * compatibility)
*
* Results:
- * The converted bytes are stored in the DString, which is then NULL
- * terminated in an encoding-specific manner. The return value is
- * the error position in the source string or -1 if no conversion error
- * is reported.
- *
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
+ *
* Side effects:
- * None.
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner.
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
*
*-------------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_ExternalToUtfDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* encoding-specific string length. */
int flags, /* Conversion control flags. */
- Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
* converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
{
char *dst;
Tcl_EncodingState state;
@@ -1164,14 +1224,27 @@ Tcl_ExternalToUtfDStringEx(
Tcl_Size dstLen, soFar;
const char *srcStart = src;
+ /* DO FIRST - Must always be initialized before returning */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
if (encoding == NULL) {
encoding = systemEncoding;
}
- encodingPtr = (Encoding *) encoding;
+ encodingPtr = (Encoding *)encoding;
if (src == NULL) {
srcLen = 0;
@@ -1213,8 +1286,31 @@ Tcl_ExternalToUtfDStringEx(
*/
if ((result != TCL_CONVERT_NOSPACE) &&
!(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) {
+ Tcl_Size nBytesProcessed = (src - srcStart);
+
Tcl_DStringSetLength(dstPtr, soFar);
- return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ char buf[TCL_INTEGER_SPACE];
+ snprintf(buf, sizeof(buf), "%" TCL_Z_MODIFIER "u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf("unexpected byte sequence starting at index %"
+ TCL_Z_MODIFIER "u: '\\x%02X'",
+ nBytesProcessed,
+ UCHAR(srcStart[nBytesProcessed])));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL);
+ }
+ }
+ return result;
}
flags &= ~TCL_ENCODING_START;
@@ -1319,6 +1415,9 @@ Tcl_ExternalToUtf(
}
if (!noTerminate) {
+ if ((int) dstLen < 1) {
+ return TCL_CONVERT_NOSPACE;
+ }
/*
* If there are any null characters in the middle of the buffer,
* they will converted to the UTF-8 null character (\xC0\x80). To get
@@ -1327,6 +1426,10 @@ Tcl_ExternalToUtf(
*/
dstLen--;
+ } else {
+ if (dstLen <= 0 && srcLen > 0) {
+ return TCL_CONVERT_NOSPACE;
+ }
}
if (encodingPtr->toUtfProc == UtfToUtfProc) {
flags |= ENCODING_INPUT;
@@ -1382,7 +1485,8 @@ Tcl_UtfToExternalDString(
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
- Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr);
+ Tcl_UtfToExternalDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
return Tcl_DStringValue(dstPtr);
}
@@ -1395,45 +1499,73 @@ Tcl_UtfToExternalDString(
* Convert a source buffer from UTF-8 to the specified encoding.
* The parameter flags controls the behavior, if any of the bytes in
* the source buffer are invalid or cannot be represented in the
- * target encoding.
- * Possible flags values:
- * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
- * fallback character. Always return -1 (Default in Tcl 8.7).
- * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
- * Only valid for "utf-8" and "cesu-8". This flag may be used together
- * with the other flags.
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
*
* Results:
- * The converted bytes are stored in the DString, which is then NULL
- * terminated in an encoding-specific manner. The return value is
- * the error position in the source string or -1 if no conversion error
- * is reported.
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
*
* Side effects:
- * None.
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
*
*-------------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_UtfToExternalDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* strlen(). */
int flags, /* Conversion control flags. */
- Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
* converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
int result;
- Tcl_Size dstLen, soFar;
const char *srcStart = src;
+ Tcl_Size dstLen, soFar;
+ /* DO FIRST - must always be initialized on return */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
@@ -1447,6 +1579,7 @@ Tcl_UtfToExternalDStringEx(
} else if (srcLen == TCL_INDEX_NONE) {
srcLen = strlen(src);
}
+
flags |= TCL_ENCODING_START;
while (1) {
int srcChunkLen, srcChunkRead;
@@ -1461,8 +1594,8 @@ Tcl_UtfToExternalDStringEx(
dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen;
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
- srcChunkLen, flags, &state, dst, dstChunkLen,
- &srcChunkRead, &dstChunkWrote, &dstChunkChars);
+ srcChunkLen, flags, &state, dst, dstChunkLen,
+ &srcChunkRead, &dstChunkWrote, &dstChunkChars);
soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr);
/* Move past the part processed in this go around */
@@ -1478,12 +1611,38 @@ Tcl_UtfToExternalDStringEx(
*/
if ((result != TCL_CONVERT_NOSPACE) &&
!(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) {
+ Tcl_Size nBytesProcessed = (src - srcStart);
size_t i = soFar + encodingPtr->nullSize - 1;
/* Loop as DStringSetLength only stores one nul byte at a time */
while (i+1 >= soFar+1) {
Tcl_DStringSetLength(dstPtr, i--);
}
- return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed);
+ int ucs4;
+ char buf[TCL_INTEGER_SPACE];
+ TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4);
+ snprintf(buf, sizeof(buf), "%" TCL_Z_MODIFIER "u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "unexpected character at index %" TCL_Z_MODIFIER
+ "u: 'U+%06X'",
+ pos,
+ ucs4));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
+ buf, NULL);
+ }
+ }
+ return result;
}
flags &= ~TCL_ENCODING_START;
@@ -1581,10 +1740,17 @@ Tcl_UtfToExternal(
dstCharsPtr = &dstChars;
}
+ if (dstLen < encodingPtr->nullSize) {
+ return TCL_CONVERT_NOSPACE;
+ }
dstLen -= encodingPtr->nullSize;
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
flags, statePtr, dst, dstLen, srcReadPtr,
dstWrotePtr, dstCharsPtr);
+ /*
+ * Buffer is terminated irrespective of result. Not sure this is
+ * reasonable but keep for historical/compatibility reasons.
+ */
memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize);
return result;
@@ -1644,7 +1810,7 @@ OpenEncodingFileChannel(
const char *name) /* The name of the encoding file on disk and
* also the name for new encoding. */
{
- Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1);
+ Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE);
Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj);
Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath());
Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap);
@@ -1654,7 +1820,7 @@ OpenEncodingFileChannel(
TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir);
Tcl_IncrRefCount(nameObj);
- Tcl_AppendToObj(fileNameObj, ".enc", -1);
+ Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE);
Tcl_IncrRefCount(fileNameObj);
Tcl_DictObjGet(NULL, map, nameObj, &directory);
@@ -2286,6 +2452,7 @@ BinaryProc(
if (dstLen < 0) {
dstLen = 0;
}
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
srcLen = *dstCharsPtr;
}
@@ -2319,14 +2486,12 @@ BinaryProc(
*-------------------------------------------------------------------------
*/
-#define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN))
-
static int
UtfToUtfProc(
- void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */
+ void *clientData, /* additional flags */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
+ int flags, /* TCL_ENCODING_* conversion control flags. */
TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
@@ -2348,12 +2513,14 @@ UtfToUtfProc(
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
int ch;
+ int profile;
result = TCL_OK;
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= 6;
}
@@ -2365,7 +2532,9 @@ UtfToUtfProc(
flags |= PTR2INT(clientData);
dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
+ profile = CHANNEL_PROFILE_GET(flags);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
@@ -2382,29 +2551,35 @@ UtfToUtfProc(
if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
/*
* Copy 7bit characters, but skip null-bytes when we are in input
- * mode, so that they get converted to 0xC080.
+ * mode, so that they get converted to \xC0\x80.
*/
-
*dst++ = *src++;
- } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
- && (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) && (!(flags & ENCODING_INPUT)
- || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- || (flags & ENCODING_FAILINDEX))) {
- /*
- * If in input mode, and -strict or -failindex is specified: This is an error.
- */
- if ((STOPONERROR) && (flags & ENCODING_INPUT)) {
- result = TCL_CONVERT_SYNTAX;
- break;
+ } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
+ (UCHAR(src[1]) == 0x80) &&
+ (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||
+ PROFILE_REPLACE(profile))) {
+ /* Special sequence \xC0\x80 */
+ if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {
+ if (PROFILE_REPLACE(profile)) {
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ src += 2;
+ } else {
+ /* PROFILE_STRICT */
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ } else {
+ /*
+ * Convert 0xC080 to real nulls when we are in output mode,
+ * irrespective of the profile.
+ */
+ *dst++ = 0;
+ src += 2;
}
- /*
- * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'.
- */
- *dst++ = 0;
- src += 2;
} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
+ * Incomplete byte sequence.
* Always check before using TclUtfToUCS4. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
* incomplete char its bytes are made to represent themselves
@@ -2412,32 +2587,43 @@ UtfToUtfProc(
*/
if (flags & ENCODING_INPUT) {
- if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
- result = TCL_CONVERT_MULTIBYTE;
- break;
- }
- if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) {
- result = TCL_CONVERT_SYNTAX;
+ /* Incomplete bytes for modified UTF-8 target */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & TCL_ENCODING_CHAR_LIMIT)
+ ? TCL_CONVERT_MULTIBYTE
+ : TCL_CONVERT_SYNTAX;
break;
}
}
- char chbuf[2];
- chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
- TclUtfToUCS4(chbuf, &ch);
+ if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ ++src;
+ } else {
+ /* TCL_ENCODING_PROFILE_TCL8 */
+ char chbuf[2];
+ chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
+ TclUtfToUCS4(chbuf, &ch);
+ }
dst += Tcl_UniCharToUtf(ch, dst);
} else {
+ int isInvalid = 0;
size_t len = TclUtfToUCS4(src, &ch);
if (flags & ENCODING_INPUT) {
- if ((len < 2) && (ch != 0)
- && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) {
- goto utf8Syntax;
- } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)
- && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) {
- utf8Syntax:
- result = TCL_CONVERT_SYNTAX;
- break;
+ if ((len < 2) && (ch != 0)) {
+ isInvalid = 1;
+ } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
+ isInvalid = 1;
+ }
+ if (isInvalid) {
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
}
+
const char *saveSrc = src;
src += len;
if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
@@ -2457,38 +2643,45 @@ UtfToUtfProc(
*dst++ = (char) ((ch | 0x80) & 0xBF);
continue;
#if TCL_UTF_MAX < 4
- } else if ((ch | 0x7FF) == 0xDFFF) {
+ } else if (SURROGATE(ch)) {
/*
* A surrogate character is detected, handle especially.
*/
-
- if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) {
+ if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
}
- int low = ch;
- len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
-
- if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
-
- if (STOPONERROR) {
- result = TCL_CONVERT_UNKNOWN;
- src = saveSrc;
- break;
+ if (PROFILE_REPLACE(profile)) {
+ /* TODO - is this right for cesu8 or should we fall through below? */
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ int low = ch;
+ len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0;
+
+ if ((!LOW_SURROGATE(low)) || (ch & 0x400)) {
+
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ goto cesu8;
}
- goto cesu8;
+ src += len;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = low;
}
- src += len;
- dst += Tcl_UniCharToUtf(ch, dst);
- ch = low;
#endif
- } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch & ~0x7FF) == 0xD800))) {
+ } else if (PROFILE_STRICT(profile) &&
+ (!(flags & ENCODING_INPUT)) &&
+ SURROGATE(ch)) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
- } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- && (flags & ENCODING_INPUT) && ((ch & ~0x7FF) == 0xD800)) {
+ } else if (PROFILE_STRICT(profile) &&
+ (flags & ENCODING_INPUT) &&
+ SURROGATE(ch)) {
result = TCL_CONVERT_SYNTAX;
src = saveSrc;
break;
@@ -2545,8 +2738,9 @@ Utf32ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- int ch, bytesLeft = srcLen % 4;
+ int ch = 0, bytesLeft = srcLen % 4;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
@@ -2556,12 +2750,27 @@ Utf32ToUtfProc(
/*
* Check alignment with utf-32 (4 == sizeof(UTF-32))
*/
-
if (bytesLeft != 0) {
+ /* We have a truncated code unit */
result = TCL_CONVERT_MULTIBYTE;
srcLen -= bytesLeft;
}
+#if TCL_UTF_MAX < 4
+ /*
+ * If last code point is a high surrogate, we cannot handle that yet,
+ * unless we are at the end.
+ */
+
+ if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
+ result = TCL_CONVERT_MULTIBYTE;
+ srcLen-= 4;
+ }
+#endif
+
srcStart = src;
srcEnd = src + srcLen;
@@ -2574,23 +2783,35 @@ Utf32ToUtfProc(
break;
}
+#if TCL_UTF_MAX < 4
+ int prev = ch;
+#endif
if (flags & TCL_ENCODING_LE) {
ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
}
- if ((unsigned)ch > 0x10FFFF) {
- if (STOPONERROR) {
- result = TCL_CONVERT_SYNTAX;
- break;
- }
- ch = 0xFFFD;
- } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- && ((ch & ~0x7FF) == 0xD800)) {
- if (STOPONERROR) {
+#if TCL_UTF_MAX < 4
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+#endif
+
+ if ((unsigned)ch > 0x10FFFF) {
+ ch = UNICODE_REPLACE_CHAR;
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
break;
}
+ } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+#if TCL_UTF_MAX < 4
+ ch = 0;
+#endif
+ break;
+ } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) {
+ ch = UNICODE_REPLACE_CHAR;
}
/*
@@ -2598,30 +2819,41 @@ Utf32ToUtfProc(
* unsigned short-size data.
*/
- if ((ch > 0) && (ch < 0x80)) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
} else {
dst += Tcl_UniCharToUtf(ch, dst);
}
- src += sizeof(unsigned int);
+ src += 4;
+ }
+
+ /*
+ * If we had a truncated code unit at the end AND this is the last
+ * fragment AND profile is not "strict", stick FFFD in its place.
+ */
+#if TCL_UTF_MAX < 4
+ if (HIGH_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
}
+#endif
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
- /* We have a single byte left-over at the end */
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
} else {
- /* destination is not full, so we really are at the end now */
- if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
} else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
result = TCL_OK;
- dst += Tcl_UniCharToUtf(0xFFFD, dst);
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
numChars++;
- src += bytesLeft;
+ src += bytesLeft; /* Go past truncated code unit */
}
}
}
+
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
@@ -2674,6 +2906,7 @@ UtfToUtf32Proc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -2698,11 +2931,14 @@ UtfToUtf32Proc(
break;
}
len = TclUtfToUCS4(src, &ch);
- if ((ch & ~0x7FF) == 0xD800) {
- if (STOPONERROR) {
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
src += len;
if (flags & TCL_ENCODING_LE) {
@@ -2768,6 +3004,7 @@ Utf16ToUtfProc(
int result, numChars, charLimit = INT_MAX;
unsigned short ch = 0;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
@@ -2800,7 +3037,7 @@ Utf16ToUtfProc(
dstStart = dst;
dstEnd = dst + dstLen - TCL_UTF_MAX;
- for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+ for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) {
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
break;
@@ -2812,9 +3049,30 @@ Utf16ToUtfProc(
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
- if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2; /* Go back to beginning of high surrogate */
+ dst--; /* Also undo writing a single byte too much */
+ numChars--;
+ break;
+ } else if (PROFILE_REPLACE(flags)) {
+ /*
+ * Previous loop wrote a single byte to mark the high surrogate.
+ * Replace it with the replacement character. Further, restart
+ * current loop iteration since need to recheck destination space
+ * and reset processing of current character.
+ */
+ ch = UNICODE_REPLACE_CHAR;
+ dst--;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ src -= 2;
+ numChars--;
+ continue;
+ } else {
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
- dst += Tcl_UniCharToUtf(-1, dst);
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
}
/*
@@ -2822,34 +3080,59 @@ Utf16ToUtfProc(
* unsigned short-size data.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
- } else {
+ } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+ } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) {
+ /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ } else {
+ /* PROFILE_REPLACE */
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ }
+ } else {
+ dst += Tcl_UniCharToUtf(ch, dst);
}
- src += sizeof(unsigned short);
}
- if ((ch & ~0x3FF) == 0xD800) {
- /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
- dst += Tcl_UniCharToUtf(-1, dst);
+ if (HIGH_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2;
+ dst--;
+ numChars--;
+ } else if (PROFILE_REPLACE(flags)) {
+ dst--;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
}
+
+ /*
+ * If we had a truncated code unit at the end AND this is the last
+ * fragment AND profile is not "strict", stick FFFD in its place.
+ */
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
- /* We have a single byte left-over at the end */
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
} else {
- /* destination is not full, so we really are at the end now */
- if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
} else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
result = TCL_OK;
- dst += Tcl_UniCharToUtf(0xFFFD, dst);
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
numChars++;
- src++;
+ src++; /* Go past truncated code unit */
}
}
}
+
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
@@ -2902,6 +3185,7 @@ UtfToUtf16Proc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -2926,11 +3210,14 @@ UtfToUtf16Proc(
break;
}
len = TclUtfToUCS4(src, &ch);
- if ((ch & ~0x7FF) == 0xD800) {
- if (STOPONERROR) {
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
src += len;
if (flags & TCL_ENCODING_LE) {
@@ -3004,6 +3291,7 @@ UtfToUcs2Proc(
int result, numChars, len;
Tcl_UniChar ch = 0;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
srcStart = src;
srcEnd = src + srcLen;
@@ -3033,25 +3321,25 @@ UtfToUcs2Proc(
#if TCL_UTF_MAX < 4
len = TclUtfToUniChar(src, &ch);
if ((ch >= 0xD800) && (len < 3)) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
src += len;
src += TclUtfToUniChar(src, &ch);
- ch = 0xFFFD;
+ ch = UNICODE_REPLACE_CHAR;
}
#else
len = TclUtfToUniChar(src, &ch);
if (ch > 0xFFFF) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
- ch = 0xFFFD;
+ ch = UNICODE_REPLACE_CHAR;
}
#endif
- if (STOPONERROR && ((ch & ~0x7FF) == 0xD800)) {
+ if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
result = TCL_CONVERT_SYNTAX;
break;
}
@@ -3126,6 +3414,7 @@ TableToUtfProc(
const unsigned short *pageZero;
TableEncodingData *dataPtr = (TableEncodingData *)clientData;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3149,31 +3438,47 @@ TableToUtfProc(
if (prefixBytes[byte]) {
src++;
if (src >= srcEnd) {
- src--;
- result = TCL_CONVERT_MULTIBYTE;
- break;
+ if (!(flags & TCL_ENCODING_END)) {
+ src--;
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ } else if (PROFILE_STRICT(flags)) {
+ src--;
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ src--; /* See bug [bdcb5126c0] */
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ } else {
+ ch = toUnicode[byte][*((unsigned char *)src)];
}
- ch = toUnicode[byte][*((unsigned char *) src)];
} else {
ch = pageZero[byte];
}
if ((ch == 0) && (byte != 0)) {
- if ((flags & ENCODING_FAILINDEX)
- || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
break;
}
if (prefixBytes[byte]) {
src--;
}
- ch = (Tcl_UniChar) byte;
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ ch = (Tcl_UniChar)byte;
+ }
}
/*
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -3243,6 +3548,7 @@ TableFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3275,11 +3581,11 @@ TableFromUtfProc(
word = fromUnicode[(ch >> 8)][ch & 0xFF];
if ((word == 0) && (ch != 0)) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
- word = dataPtr->fallback;
+ word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
}
if (prefixBytes[(word >> 8)] != 0) {
if (dst + 1 > dstEnd) {
@@ -3349,6 +3655,7 @@ Iso88591ToUtfProc(
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3372,7 +3679,7 @@ Iso88591ToUtfProc(
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -3433,6 +3740,7 @@ Iso88591FromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3463,7 +3771,7 @@ Iso88591FromUtfProc(
|| ((ch >= 0xD800) && (len < 3))
#endif
) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3476,7 +3784,7 @@ Iso88591FromUtfProc(
* Plunge on, using '?' as a fallback character.
*/
- ch = (Tcl_UniChar) '?';
+ ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */
}
if (dst > dstEnd) {
@@ -3580,6 +3888,7 @@ EscapeToUtfProc(
int state, result, numChars, charLimit = INT_MAX;
const char *dstStart, *dstEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3690,9 +3999,10 @@ EscapeToUtfProc(
if ((checked == dataPtr->numSubTables + 2)
|| (flags & TCL_ENCODING_END)) {
- if (!STOPONERROR) {
+ if (!PROFILE_STRICT(flags)) {
/*
- * Skip the unknown escape sequence.
+ * Skip the unknown escape sequence. TODO - bug?
+ * May be replace with UNICODE_REPLACE_CHAR?
*/
src += longest;
@@ -3802,6 +4112,7 @@ EscapeFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3865,7 +4176,7 @@ EscapeFromUtfProc(
if (word == 0) {
state = oldState;
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -4159,6 +4470,166 @@ InitializeEncodingSearchPath(
}
/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileParseName --
+ *
+ * Maps an encoding profile name to its integer equivalent.
+ *
+ * Results:
+ * TCL_OK on success or TCL_ERROR on failure.
+ *
+ * Side effects:
+ * Returns the profile enum value in *profilePtr
+ *
+ *------------------------------------------------------------------------
+ */
+int
+TclEncodingProfileNameToId(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ const char *profileName, /* Name of profile */
+ int *profilePtr) /* Output */
+{
+ size_t i;
+ size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+
+ for (i = 0; i < numProfiles; ++i) {
+ if (!strcmp(profileName, encodingProfiles[i].name)) {
+ *profilePtr = encodingProfiles[i].value;
+ return TCL_OK;
+ }
+ }
+ if (interp) {
+ Tcl_Obj *errorObj;
+ /* This code assumes at least two profiles :-) */
+ errorObj =
+ Tcl_ObjPrintf("bad profile name \"%s\": must be",
+ profileName);
+ for (i = 0; i < (numProfiles - 1); ++i) {
+ Tcl_AppendStringsToObj(
+ errorObj, " ", encodingProfiles[i].name, ",", NULL);
+ }
+ Tcl_AppendStringsToObj(
+ errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL);
+
+ Tcl_SetObjResult(interp, errorObj);
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILE", profileName, NULL);
+ }
+ return TCL_ERROR;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileValueToName --
+ *
+ * Maps an encoding profile value to its name.
+ *
+ * Results:
+ * Pointer to the name or NULL on failure. Caller must not make
+ * not modify the string and must make a copy to hold on to it.
+ *
+ * Side effects:
+ * None.
+ *------------------------------------------------------------------------
+ */
+const char *
+TclEncodingProfileIdToName(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ int profileValue) /* Profile #define value */
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) {
+ if (profileValue == encodingProfiles[i].value) {
+ return encodingProfiles[i].name;
+ }
+ }
+ if (interp) {
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "Internal error. Bad profile id \"%d\".",
+ profileValue));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILEID", NULL);
+ }
+ return NULL;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingSetProfileFlags --
+ *
+ * Maps the flags supported in the encoding C API's to internal flags.
+ *
+ * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is
+ * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile
+ * specified.
+ *
+ * If no profile or an invalid profile is specified, it is set to
+ * the default.
+ *
+ * Results:
+ * Internal encoding flag mask.
+ *
+ * Side effects:
+ * None.
+ *
+ *------------------------------------------------------------------------
+ */
+int TclEncodingSetProfileFlags(int flags)
+{
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT);
+ } else {
+ int profile = CHANNEL_PROFILE_GET(flags);
+ switch (profile) {
+ case TCL_ENCODING_PROFILE_TCL8:
+ case TCL_ENCODING_PROFILE_STRICT:
+ case TCL_ENCODING_PROFILE_REPLACE:
+ break;
+ case 0: /* Unspecified by caller */
+ default:
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT);
+ break;
+ }
+ }
+ return flags;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclGetEncodingProfiles --
+ *
+ * Get the list of supported encoding profiles.
+ *
+ * Results:
+ * None.
+ *
+ * Side effects:
+ * The list of profile names is stored in the interpreter result.
+ *
+ *------------------------------------------------------------------------
+ */
+void
+TclGetEncodingProfiles(Tcl_Interp *interp)
+{
+ size_t i, n;
+ Tcl_Obj *objPtr;
+ n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+ objPtr = Tcl_NewListObj(n, NULL);
+ for (i = 0; i < n; ++i) {
+ Tcl_ListObjAppendElement(
+ interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
+ }
+ Tcl_SetObjResult(interp, objPtr);
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4