summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c992
1 files changed, 799 insertions, 193 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 23d71ff..a87eb7f 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -10,6 +10,7 @@
*/
#include "tclInt.h"
+#include "tclIO.h"
typedef size_t (LengthProc)(const char *src);
@@ -35,7 +36,7 @@ typedef struct {
* encoding is deleted. */
void *clientData; /* Arbitrary value associated with encoding
* type. Passed to conversion functions. */
- size_t nullSize; /* Number of 0x00 bytes that signify
+ Tcl_Size nullSize; /* Number of 0x00 bytes that signify
* end-of-string in this encoding. This number
* is used to determine the source string
* length when the srcLen argument is
@@ -187,6 +188,36 @@ static Tcl_Encoding systemEncoding = NULL;
Tcl_Encoding tclIdentityEncoding = NULL;
/*
+ * Names of encoding profiles and corresponding integer values.
+ * Keep alphabetical order for error messages.
+ */
+static struct TclEncodingProfiles {
+ const char *name;
+ int value;
+} encodingProfiles[] = {
+ {"replace", TCL_ENCODING_PROFILE_REPLACE},
+ {"strict", TCL_ENCODING_PROFILE_STRICT},
+ {"tcl8", TCL_ENCODING_PROFILE_TCL8},
+};
+#define PROFILE_TCL8(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8))
+#define PROFILE_STRICT(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT))
+#define PROFILE_REPLACE(flags_) \
+ ((CHANNEL_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \
+ || (CHANNEL_PROFILE_GET(flags_) == 0 \
+ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE))
+
+#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
+#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800)
+#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
+#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00)
+
+/*
* The following variable is used in the sparse matrix code for a
* TableEncoding to represent a page in the table that has no entries.
*/
@@ -229,6 +260,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc;
static Tcl_EncodingConvertProc Iso88591FromUtfProc;
static Tcl_EncodingConvertProc Iso88591ToUtfProc;
+
/*
* A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
* of the internalrep. This should help the lifetime of encodings be more useful.
@@ -374,7 +406,7 @@ int
Tcl_SetEncodingSearchPath(
Tcl_Obj *searchPath)
{
- size_t dummy;
+ Tcl_Size dummy;
if (TCL_ERROR == TclListObjLengthM(NULL, searchPath, &dummy)) {
return TCL_ERROR;
@@ -421,7 +453,7 @@ void
TclSetLibraryPath(
Tcl_Obj *path)
{
- size_t dummy;
+ Tcl_Size dummy;
if (TCL_ERROR == TclListObjLengthM(NULL, path, &dummy)) {
return;
@@ -457,7 +489,7 @@ TclSetLibraryPath(
static void
FillEncodingFileMap(void)
{
- size_t i, numDirs = 0;
+ Tcl_Size i, numDirs = 0;
Tcl_Obj *map, *searchPath;
searchPath = Tcl_GetEncodingSearchPath();
@@ -472,7 +504,7 @@ FillEncodingFileMap(void)
* entries found, we favor files earlier on the search path.
*/
- size_t j, numFiles;
+ Tcl_Size j, numFiles;
Tcl_Obj *directory, *matchFileList;
Tcl_Obj **filev;
Tcl_GlobTypeData readableFiles = {
@@ -521,10 +553,16 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
-/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
- * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */
-#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */
-#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
+/*
+ * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS
+ * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this
+ * when adding bits. TODO - should really be defined in a single file.
+ *
+ * To prevent conflicting bits, only define bits within 0xff00 mask here.
+ */
+#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */
+#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
+#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */
void
TclInitEncodingSubsystem(void)
@@ -537,12 +575,16 @@ TclInitEncodingSubsystem(void)
char c;
short s;
} isLe;
+ int leFlags;
if (encodingsInitialized) {
return;
}
- isLe.s = TCL_ENCODING_LE;
+ /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */
+ isLe.s = 1;
+ leFlags = isLe.c ? TCL_ENCODING_LE : 0;
+
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -566,9 +608,9 @@ TclInitEncodingSubsystem(void)
type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = INT2PTR(TCL_ENCODING_UTF);
+ type.clientData = INT2PTR(ENCODING_UTF);
Tcl_CreateEncoding(&type);
- type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN);
+ type.clientData = INT2PTR(0);
type.encodingName = "cesu-8";
Tcl_CreateEncoding(&type);
@@ -577,13 +619,13 @@ TclInitEncodingSubsystem(void)
type.freeProc = NULL;
type.nullSize = 2;
type.encodingName = "ucs-2le";
- type.clientData = INT2PTR(TCL_ENCODING_LE|TCL_ENCODING_NOCOMPLAIN);
+ type.clientData = INT2PTR(TCL_ENCODING_LE);
Tcl_CreateEncoding(&type);
type.encodingName = "ucs-2be";
- type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN);
+ type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
type.encodingName = "ucs-2";
- type.clientData = INT2PTR(isLe.c|TCL_ENCODING_NOCOMPLAIN);
+ type.clientData = INT2PTR(leFlags);
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf32ToUtfProc;
@@ -597,7 +639,7 @@ TclInitEncodingSubsystem(void)
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-32";
- type.clientData = INT2PTR(isLe.c);
+ type.clientData = INT2PTR(leFlags);
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
@@ -605,13 +647,13 @@ TclInitEncodingSubsystem(void)
type.freeProc = NULL;
type.nullSize = 2;
type.encodingName = "utf-16le";
- type.clientData = INT2PTR(TCL_ENCODING_LE);
+ type.clientData = INT2PTR(TCL_ENCODING_LE|ENCODING_UTF);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-16be";
- type.clientData = INT2PTR(0);
+ type.clientData = INT2PTR(ENCODING_UTF);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-16";
- type.clientData = INT2PTR(isLe.c);
+ type.clientData = INT2PTR(leFlags|ENCODING_UTF);
Tcl_CreateEncoding(&type);
#ifndef TCL_NO_DEPRECATED
@@ -895,7 +937,7 @@ Tcl_GetEncodingNames(
Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
Tcl_CreateHashEntry(&table,
- Tcl_NewStringObj(encodingPtr->name, -1), &dummy);
+ Tcl_NewStringObj(encodingPtr->name, TCL_INDEX_NONE), &dummy);
}
Tcl_MutexUnlock(&encodingMutex);
@@ -940,7 +982,7 @@ Tcl_GetEncodingNames(
*
*---------------------------------------------------------------------------
*/
-size_t
+Tcl_Size
Tcl_GetEncodingNulLength(
Tcl_Encoding encoding)
{
@@ -1107,12 +1149,13 @@ Tcl_ExternalToUtfDString(
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
- size_t srcLen, /* Source string length in bytes, or -1 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* encoding-specific string length. */
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
- Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr);
+ Tcl_ExternalToUtfDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
return Tcl_DStringValue(dstPtr);
}
@@ -1126,50 +1169,82 @@ Tcl_ExternalToUtfDString(
* The parameter flags controls the behavior, if any of the bytes in
* the source buffer are invalid or cannot be represented in utf-8.
* Possible flags values:
- * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
- * fallback character. Always return -1 (Default in Tcl 8.7).
- * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
- * Only valid for "utf-8" and "cesu-8". This flag may be used together
- * with the other flags.
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
+ * Any other flag bits will cause an error to be returned (for future
+ * compatibility)
*
* Results:
- * The converted bytes are stored in the DString, which is then NULL
- * terminated in an encoding-specific manner. The return value is
- * the error position in the source string or -1 if no conversion error
- * is reported.
- *
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
+ *
* Side effects:
- * None.
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner.
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
*
*-------------------------------------------------------------------------
*/
-size_t
+int
Tcl_ExternalToUtfDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
- size_t srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* encoding-specific string length. */
int flags, /* Conversion control flags. */
- Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
* converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int result, soFar, srcRead, dstWrote, dstChars;
- size_t dstLen;
+ int result;
+ Tcl_Size dstLen, soFar;
const char *srcStart = src;
+ /* DO FIRST - Must always be initialized before returning */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
if (encoding == NULL) {
encoding = systemEncoding;
}
- encodingPtr = (Encoding *) encoding;
+ encodingPtr = (Encoding *)encoding;
if (src == NULL) {
srcLen = 0;
@@ -1177,23 +1252,70 @@ Tcl_ExternalToUtfDStringEx(
srcLen = encodingPtr->lengthProc(src);
}
- flags |= TCL_ENCODING_START | TCL_ENCODING_END;
+ flags |= TCL_ENCODING_START;
if (encodingPtr->toUtfProc == UtfToUtfProc) {
- flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+ flags |= ENCODING_INPUT;
}
while (1) {
- result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
- flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars);
- soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
+ int srcChunkLen, srcChunkRead;
+ int dstChunkLen, dstChunkWrote, dstChunkChars;
+
+ if (srcLen > INT_MAX) {
+ srcChunkLen = INT_MAX;
+ } else {
+ srcChunkLen = srcLen;
+ flags |= TCL_ENCODING_END; /* Last chunk */
+ }
+ dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen;
+
+ result = encodingPtr->toUtfProc(encodingPtr->clientData, src,
+ srcChunkLen, flags, &state, dst, dstChunkLen,
+ &srcChunkRead, &dstChunkWrote, &dstChunkChars);
+ soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr);
+
+ src += srcChunkRead;
+
+ /*
+ * Keep looping in two case -
+ * - our destination buffer did not have enough room
+ * - we had not passed in all the data and error indicated fragment
+ * of a multibyte character
+ * In both cases we have to grow buffer, move the input source pointer
+ * and loop. Otherwise, return the result we got.
+ */
+ if ((result != TCL_CONVERT_NOSPACE) &&
+ !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) {
+ Tcl_Size nBytesProcessed = (src - srcStart);
- src += srcRead;
- if (result != TCL_CONVERT_NOSPACE) {
Tcl_DStringSetLength(dstPtr, soFar);
- return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ char buf[TCL_INTEGER_SPACE];
+ sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf("unexpected byte sequence starting at index %"
+ TCL_Z_MODIFIER "u: '\\x%02X'",
+ nBytesProcessed,
+ UCHAR(srcStart[nBytesProcessed])));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL);
+ }
+ }
+ return result;
}
+
flags &= ~TCL_ENCODING_START;
- srcLen -= srcRead;
+ srcLen -= srcChunkRead;
+
if (Tcl_DStringLength(dstPtr) == 0) {
Tcl_DStringSetLength(dstPtr, dstLen);
}
@@ -1227,8 +1349,8 @@ Tcl_ExternalToUtf(
Tcl_Encoding encoding, /* The encoding for the source string, or NULL
* for the default system encoding. */
const char *src, /* Source string in specified encoding. */
- size_t srcLen, /* Source string length in bytes, or -1
- * for encoding-specific string length. */
+ Tcl_Size srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for
+ * encoding-specific string length. */
int flags, /* Conversion control flags. */
Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
* information used during a piecewise
@@ -1237,7 +1359,7 @@ Tcl_ExternalToUtf(
* routine under control of flags argument. */
char *dst, /* Output buffer in which converted string is
* stored. */
- size_t dstLen, /* The maximum length of output buffer in
+ Tcl_Size dstLen, /* The maximum length of output buffer in
* bytes. */
int *srcReadPtr, /* Filled with the number of bytes from the
* source string that were converted. This may
@@ -1272,6 +1394,13 @@ Tcl_ExternalToUtf(
flags |= TCL_ENCODING_START | TCL_ENCODING_END;
statePtr = &state;
}
+ if (srcLen > INT_MAX) {
+ srcLen = INT_MAX;
+ flags &= ~TCL_ENCODING_END;
+ }
+ if (dstLen > INT_MAX) {
+ dstLen = INT_MAX;
+ }
if (srcReadPtr == NULL) {
srcReadPtr = &srcRead;
}
@@ -1286,6 +1415,9 @@ Tcl_ExternalToUtf(
}
if (!noTerminate) {
+ if ((int) dstLen < 1) {
+ return TCL_CONVERT_NOSPACE;
+ }
/*
* If there are any null characters in the middle of the buffer,
* they will converted to the UTF-8 null character (\xC0\x80). To get
@@ -1294,9 +1426,13 @@ Tcl_ExternalToUtf(
*/
dstLen--;
+ } else {
+ if (dstLen <= 0 && srcLen > 0) {
+ return TCL_CONVERT_NOSPACE;
+ }
}
if (encodingPtr->toUtfProc == UtfToUtfProc) {
- flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+ flags |= ENCODING_INPUT;
}
do {
Tcl_EncodingState savedState = *statePtr;
@@ -1344,12 +1480,13 @@ Tcl_UtfToExternalDString(
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
- size_t srcLen, /* Source string length in bytes, or -1 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* strlen(). */
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
- Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr);
+ Tcl_UtfToExternalDStringEx(
+ NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL);
return Tcl_DStringValue(dstPtr);
}
@@ -1362,45 +1499,73 @@ Tcl_UtfToExternalDString(
* Convert a source buffer from UTF-8 to the specified encoding.
* The parameter flags controls the behavior, if any of the bytes in
* the source buffer are invalid or cannot be represented in the
- * target encoding.
- * Possible flags values:
- * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
- * fallback character. Always return -1 (Default in Tcl 8.7).
- * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
- * Only valid for "utf-8" and "cesu-8". This flag may be used together
- * with the other flags.
+ * target encoding. It should be composed by OR-ing the following:
+ * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
+ * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile
+ * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags
*
* Results:
- * The converted bytes are stored in the DString, which is then NULL
- * terminated in an encoding-specific manner. The return value is
- * the error position in the source string or -1 if no conversion error
- * is reported.
+ * The return value is one of
+ * TCL_OK: success. Converted string in *dstPtr
+ * TCL_ERROR: error in passed parameters. Error message in interp
+ * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
+ * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
+ * TCL_CONVERT_UNKNOWN: source contained a character that could not
+ * be represented in target encoding.
*
* Side effects:
- * None.
+ *
+ * TCL_OK: The converted bytes are stored in the DString and NUL
+ * terminated in an encoding-specific manner
+ * TCL_ERROR: an error, message is stored in the interp if not NULL.
+ * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored
+ * in the interpreter (if not NULL). If errorLocPtr is not NULL,
+ * no error message is stored as it is expected the caller is
+ * interested in whatever is decoded so far and not treating this
+ * as an error condition.
+ *
+ * In addition, *dstPtr is always initialized and must be cleared
+ * by the caller irrespective of the return code.
*
*-------------------------------------------------------------------------
*/
-size_t
+int
Tcl_UtfToExternalDStringEx(
+ Tcl_Interp *interp, /* For error messages. May be NULL. */
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
- size_t srcLen, /* Source string length in bytes, or < 0 for
+ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for
* strlen(). */
- int flags, /* Conversion control flags. */
- Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ int flags, /* Conversion control flags. */
+ Tcl_DString *dstPtr, /* Uninitialized or free DString in which the
* converted string is stored. */
+ Tcl_Size *errorLocPtr) /* Where to store the error location
+ (or TCL_INDEX_NONE if no error). May
+ be NULL. */
{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int result, soFar, srcRead, dstWrote, dstChars;
+ int result;
const char *srcStart = src;
- size_t dstLen;
+ Tcl_Size dstLen, soFar;
+ /* DO FIRST - must always be initialized on return */
Tcl_DStringInit(dstPtr);
+
+ if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) {
+ /* TODO - what other flags are illegal? - See TIP 656 */
+ Tcl_SetObjResult(
+ interp,
+ Tcl_NewStringObj(
+ "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
+ TCL_INDEX_NONE));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
+ return TCL_ERROR;
+ }
+
dst = Tcl_DStringValue(dstPtr);
dstLen = dstPtr->spaceAvl - 1;
@@ -1414,24 +1579,75 @@ Tcl_UtfToExternalDStringEx(
} else if (srcLen == TCL_INDEX_NONE) {
srcLen = strlen(src);
}
- flags |= TCL_ENCODING_START | TCL_ENCODING_END;
+
+ flags |= TCL_ENCODING_START;
while (1) {
+ int srcChunkLen, srcChunkRead;
+ int dstChunkLen, dstChunkWrote, dstChunkChars;
+
+ if (srcLen > INT_MAX) {
+ srcChunkLen = INT_MAX;
+ } else {
+ srcChunkLen = srcLen;
+ flags |= TCL_ENCODING_END; /* Last chunk */
+ }
+ dstChunkLen = dstLen > INT_MAX ? INT_MAX : dstLen;
+
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
- srcLen, flags, &state, dst, dstLen,
- &srcRead, &dstWrote, &dstChars);
- soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
-
- src += srcRead;
- if (result != TCL_CONVERT_NOSPACE) {
- int i = soFar + encodingPtr->nullSize - 1;
- while (i >= soFar) {
+ srcChunkLen, flags, &state, dst, dstChunkLen,
+ &srcChunkRead, &dstChunkWrote, &dstChunkChars);
+ soFar = dst + dstChunkWrote - Tcl_DStringValue(dstPtr);
+
+ /* Move past the part processed in this go around */
+ src += srcChunkRead;
+
+ /*
+ * Keep looping in two case -
+ * - our destination buffer did not have enough room
+ * - we had not passed in all the data and error indicated fragment
+ * of a multibyte character
+ * In both cases we have to grow buffer, move the input source pointer
+ * and loop. Otherwise, return the result we got.
+ */
+ if ((result != TCL_CONVERT_NOSPACE) &&
+ !(result == TCL_CONVERT_MULTIBYTE && (flags & TCL_ENCODING_END))) {
+ Tcl_Size nBytesProcessed = (src - srcStart);
+ size_t i = soFar + encodingPtr->nullSize - 1;
+ /* Loop as DStringSetLength only stores one nul byte at a time */
+ while (i+1 >= soFar+1) {
Tcl_DStringSetLength(dstPtr, i--);
}
- return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart);
+ if (errorLocPtr) {
+ /*
+ * Do not write error message into interpreter if caller
+ * wants to know error location.
+ */
+ *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed;
+ } else {
+ /* Caller wants error message on failure */
+ if (result != TCL_OK && interp != NULL) {
+ Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed);
+ int ucs4;
+ char buf[TCL_INTEGER_SPACE];
+ TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4);
+ sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed);
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "unexpected character at index %" TCL_Z_MODIFIER
+ "u: 'U+%06X'",
+ pos,
+ ucs4));
+ Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
+ buf, NULL);
+ }
+ }
+ return result;
}
flags &= ~TCL_ENCODING_START;
- srcLen -= srcRead;
+ srcLen -= srcChunkRead;
+
if (Tcl_DStringLength(dstPtr) == 0) {
Tcl_DStringSetLength(dstPtr, dstLen);
}
@@ -1465,8 +1681,8 @@ Tcl_UtfToExternal(
Tcl_Encoding encoding, /* The encoding for the converted string, or
* NULL for the default system encoding. */
const char *src, /* Source string in UTF-8. */
- size_t srcLen, /* Source string length in bytes, or -1
- * for strlen(). */
+ Tcl_Size srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for
+ * strlen(). */
int flags, /* Conversion control flags. */
Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
* information used during a piecewise
@@ -1475,7 +1691,7 @@ Tcl_UtfToExternal(
* routine under control of flags argument. */
char *dst, /* Output buffer in which converted string
* is stored. */
- size_t dstLen, /* The maximum length of output buffer in
+ Tcl_Size dstLen, /* The maximum length of output buffer in
* bytes. */
int *srcReadPtr, /* Filled with the number of bytes from the
* source string that were converted. This may
@@ -1507,6 +1723,13 @@ Tcl_UtfToExternal(
flags |= TCL_ENCODING_START | TCL_ENCODING_END;
statePtr = &state;
}
+ if (srcLen > INT_MAX) {
+ srcLen = INT_MAX;
+ flags &= ~TCL_ENCODING_END;
+ }
+ if (dstLen > INT_MAX) {
+ dstLen = INT_MAX;
+ }
if (srcReadPtr == NULL) {
srcReadPtr = &srcRead;
}
@@ -1517,10 +1740,17 @@ Tcl_UtfToExternal(
dstCharsPtr = &dstChars;
}
+ if (dstLen < encodingPtr->nullSize) {
+ return TCL_CONVERT_NOSPACE;
+ }
dstLen -= encodingPtr->nullSize;
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
flags, statePtr, dst, dstLen, srcReadPtr,
dstWrotePtr, dstCharsPtr);
+ /*
+ * Buffer is terminated irrespective of result. Not sure this is
+ * reasonable but keep for historical/compatibility reasons.
+ */
memset(&dst[*dstWrotePtr], '\0', encodingPtr->nullSize);
return result;
@@ -1580,17 +1810,17 @@ OpenEncodingFileChannel(
const char *name) /* The name of the encoding file on disk and
* also the name for new encoding. */
{
- Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1);
+ Tcl_Obj *nameObj = Tcl_NewStringObj(name, TCL_INDEX_NONE);
Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj);
Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath());
Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap);
Tcl_Obj **dir, *path, *directory = NULL;
Tcl_Channel chan = NULL;
- size_t i, numDirs;
+ Tcl_Size i, numDirs;
TclListObjGetElementsM(NULL, searchPath, &numDirs, &dir);
Tcl_IncrRefCount(nameObj);
- Tcl_AppendToObj(fileNameObj, ".enc", -1);
+ Tcl_AppendToObj(fileNameObj, ".enc", TCL_INDEX_NONE);
Tcl_IncrRefCount(fileNameObj);
Tcl_DictObjGet(NULL, map, nameObj, &directory);
@@ -1851,7 +2081,7 @@ LoadTableEncoding(
for (i = 0; i < numPages; i++) {
int ch;
const char *p;
- size_t expected = 3 + 16 * (16 * 4 + 1);
+ Tcl_Size expected = 3 + 16 * (16 * 4 + 1);
if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) {
return NULL;
@@ -2087,7 +2317,7 @@ LoadEscapeEncoding(
Tcl_DStringInit(&escapeData);
while (1) {
- size_t argc;
+ Tcl_Size argc;
const char **argv;
char *line;
Tcl_DString lineString;
@@ -2135,7 +2365,7 @@ LoadEscapeEncoding(
Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
}
}
- Tcl_Free((void *)argv);
+ Tcl_Free(argv);
Tcl_DStringFree(&lineString);
}
@@ -2222,6 +2452,7 @@ BinaryProc(
if (dstLen < 0) {
dstLen = 0;
}
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
srcLen = *dstCharsPtr;
}
@@ -2242,9 +2473,9 @@ BinaryProc(
*
* UtfToUtfProc --
*
- * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
- * is not a no-op, because it will turn a stream of improperly formed
- * UTF-8 into a properly formed stream.
+ * Converts from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
+ * is not a no-op, because it turns a stream of improperly formed
+ * UTF-8 into a properly-formed stream.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2255,14 +2486,12 @@ BinaryProc(
*-------------------------------------------------------------------------
*/
-#define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN))
-
static int
UtfToUtfProc(
- void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */
+ void *clientData, /* additional flags */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
+ int flags, /* TCL_ENCODING_* conversion control flags. */
TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
@@ -2284,12 +2513,14 @@ UtfToUtfProc(
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
int ch;
+ int profile;
result = TCL_OK;
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= 6;
}
@@ -2299,9 +2530,11 @@ UtfToUtfProc(
dstStart = dst;
flags |= PTR2INT(clientData);
- dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
+ dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
+ profile = CHANNEL_PROFILE_GET(flags);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
@@ -2315,62 +2548,85 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) {
+ if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
/*
* Copy 7bit characters, but skip null-bytes when we are in input
- * mode, so that they get converted to 0xC080.
+ * mode, so that they get converted to \xC0\x80.
*/
-
*dst++ = *src++;
- } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
- && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
- /*
- * If in input mode, and -strict is specified: This is an error.
- */
- if (flags & TCL_ENCODING_MODIFIED) {
- result = TCL_CONVERT_SYNTAX;
- break;
+ } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
+ (UCHAR(src[1]) == 0x80) &&
+ (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||
+ PROFILE_REPLACE(profile))) {
+ /* Special sequence \xC0\x80 */
+ if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {
+ if (PROFILE_REPLACE(profile)) {
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ src += 2;
+ } else {
+ /* PROFILE_STRICT */
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ } else {
+ /*
+ * Convert 0xC080 to real nulls when we are in output mode,
+ * irrespective of the profile.
+ */
+ *dst++ = 0;
+ src += 2;
}
- /*
- * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'.
- */
- *dst++ = 0;
- src += 2;
} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
+ * Incomplete byte sequence.
* Always check before using TclUtfToUCS4. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
* incomplete char its bytes are made to represent themselves
* unless the user has explicitly asked to be told.
*/
- if (flags & TCL_ENCODING_MODIFIED) {
- if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
- result = TCL_CONVERT_MULTIBYTE;
+ if (flags & ENCODING_INPUT) {
+ /* Incomplete bytes for modified UTF-8 target */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & TCL_ENCODING_CHAR_LIMIT)
+ ? TCL_CONVERT_MULTIBYTE
+ : TCL_CONVERT_SYNTAX;
break;
}
- if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
- result = TCL_CONVERT_SYNTAX;
- break;
}
- ch = UCHAR(*src++);
+ if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ ++src;
} else {
+ /* TCL_ENCODING_PROFILE_TCL8 */
char chbuf[2];
chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
TclUtfToUCS4(chbuf, &ch);
}
dst += Tcl_UniCharToUtf(ch, dst);
} else {
- const char *saveSrc = src;
+ int isInvalid = 0;
size_t len = TclUtfToUCS4(src, &ch);
- if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)
- && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
- result = TCL_CONVERT_SYNTAX;
- break;
+ if (flags & ENCODING_INPUT) {
+ if ((len < 2) && (ch != 0)) {
+ isInvalid = 1;
+ } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
+ isInvalid = 1;
+ }
+ if (isInvalid) {
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
+ }
}
+
+ const char *saveSrc = src;
src += len;
- if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) {
+ if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
if (ch > 0xFFFF) {
/* CESU-8 6-byte sequence for chars > U+FFFF */
ch -= 0x10000;
@@ -2387,33 +2643,45 @@ UtfToUtfProc(
*dst++ = (char) ((ch | 0x80) & 0xBF);
continue;
#if TCL_UTF_MAX < 4
- } else if ((ch | 0x7FF) == 0xDFFF) {
+ } else if (SURROGATE(ch)) {
/*
* A surrogate character is detected, handle especially.
*/
-
- int low = ch;
- len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
-
- if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
-
- if (STOPONERROR) {
- result = TCL_CONVERT_UNKNOWN;
- src = saveSrc;
- break;
+ if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (PROFILE_REPLACE(profile)) {
+ /* TODO - is this right for cesu8 or should we fall through below? */
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ int low = ch;
+ len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0;
+
+ if ((!LOW_SURROGATE(low)) || (ch & 0x400)) {
+
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ goto cesu8;
}
- goto cesu8;
+ src += len;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = low;
}
- src += len;
- dst += Tcl_UniCharToUtf(ch, dst);
- ch = low;
#endif
- } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) {
+ } else if (PROFILE_STRICT(profile) &&
+ (!(flags & ENCODING_INPUT)) &&
+ SURROGATE(ch)) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
- } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- && (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) {
+ } else if (PROFILE_STRICT(profile) &&
+ (flags & ENCODING_INPUT) &&
+ SURROGATE(ch)) {
result = TCL_CONVERT_SYNTAX;
src = saveSrc;
break;
@@ -2470,8 +2738,9 @@ Utf32ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- int ch;
+ int ch = 0, bytesLeft = srcLen % 4;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
@@ -2481,11 +2750,26 @@ Utf32ToUtfProc(
/*
* Check alignment with utf-32 (4 == sizeof(UTF-32))
*/
+ if (bytesLeft != 0) {
+ /* We have a truncated code unit */
+ result = TCL_CONVERT_MULTIBYTE;
+ srcLen -= bytesLeft;
+ }
- if ((srcLen % 4) != 0) {
+#if TCL_UTF_MAX < 4
+ /*
+ * If last code point is a high surrogate, we cannot handle that yet,
+ * unless we are at the end.
+ */
+
+ if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
result = TCL_CONVERT_MULTIBYTE;
- srcLen &= -4;
+ srcLen-= 4;
}
+#endif
srcStart = src;
srcEnd = src + srcLen;
@@ -2499,17 +2783,35 @@ Utf32ToUtfProc(
break;
}
+#if TCL_UTF_MAX < 4
+ int prev = ch;
+#endif
if (flags & TCL_ENCODING_LE) {
ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
}
- if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- && ((ch & ~0x7FF) == 0xD800))) {
- if (STOPONERROR) {
+#if TCL_UTF_MAX < 4
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+#endif
+
+ if ((unsigned)ch > 0x10FFFF) {
+ ch = UNICODE_REPLACE_CHAR;
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
break;
}
+ } else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+#if TCL_UTF_MAX < 4
+ ch = 0;
+#endif
+ break;
+ } else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) {
+ ch = UNICODE_REPLACE_CHAR;
}
/*
@@ -2517,12 +2819,39 @@ Utf32ToUtfProc(
* unsigned short-size data.
*/
- if ((ch > 0) && (ch < 0x80)) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
} else {
dst += Tcl_UniCharToUtf(ch, dst);
}
- src += sizeof(unsigned int);
+ src += 4;
+ }
+
+ /*
+ * If we had a truncated code unit at the end AND this is the last
+ * fragment AND profile is not "strict", stick FFFD in its place.
+ */
+#if TCL_UTF_MAX < 4
+ if (HIGH_SURROGATE(ch)) {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+#endif
+
+ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ } else {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ } else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
+ result = TCL_OK;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ numChars++;
+ src += bytesLeft; /* Go past truncated code unit */
+ }
+ }
}
*srcReadPtr = src - srcStart;
@@ -2577,6 +2906,7 @@ UtfToUtf32Proc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -2601,11 +2931,14 @@ UtfToUtf32Proc(
break;
}
len = TclUtfToUCS4(src, &ch);
- if ((ch & ~0x7FF) == 0xD800) {
- if (STOPONERROR) {
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
src += len;
if (flags & TCL_ENCODING_LE) {
@@ -2669,8 +3002,9 @@ Utf16ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- unsigned short ch;
+ unsigned short ch = 0;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
@@ -2687,10 +3021,11 @@ Utf16ToUtfProc(
}
/*
- * If last code point is a high surrogate, we cannot handle that yet.
+ * If last code point is a high surrogate, we cannot handle that yet,
+ * unless we are at the end.
*/
- if ((srcLen >= 2) &&
+ if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 2;
@@ -2702,29 +3037,100 @@ Utf16ToUtfProc(
dstStart = dst;
dstEnd = dst + dstLen - TCL_UTF_MAX;
- for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+ for (numChars = 0; src < srcEnd && numChars <= charLimit; src += 2, numChars++) {
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
break;
}
+ unsigned short prev = ch;
if (flags & TCL_ENCODING_LE) {
ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
+ if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2; /* Go back to beginning of high surrogate */
+ dst--; /* Also undo writing a single byte too much */
+ numChars--;
+ break;
+ } else if (PROFILE_REPLACE(flags)) {
+ /*
+ * Previous loop wrote a single byte to mark the high surrogate.
+ * Replace it with the replacement character. Further, restart
+ * current loop iteration since need to recheck destination space
+ * and reset processing of current character.
+ */
+ ch = UNICODE_REPLACE_CHAR;
+ dst--;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ src -= 2;
+ numChars--;
+ continue;
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+ }
/*
* Special case for 1-byte utf chars for speed. Make sure we work with
* unsigned short-size data.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (ch & 0xFF);
- } else {
+ } else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
+ } else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) {
+ /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ } else {
+ /* PROFILE_REPLACE */
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ }
+ } else {
+ dst += Tcl_UniCharToUtf(ch, dst);
+ }
+ }
+
+ if (HIGH_SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ src -= 2;
+ dst--;
+ numChars--;
+ } else if (PROFILE_REPLACE(flags)) {
+ dst--;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ } else {
+ /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
+ dst += Tcl_UniCharToUtf(-1, dst);
+ }
+ }
+
+ /*
+ * If we had a truncated code unit at the end AND this is the last
+ * fragment AND profile is not "strict", stick FFFD in its place.
+ */
+ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ } else {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_SYNTAX;
+ } else {
+ /* PROFILE_REPLACE or PROFILE_TCL8 */
+ result = TCL_OK;
+ dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
+ numChars++;
+ src++; /* Go past truncated code unit */
+ }
}
- src += sizeof(unsigned short);
}
*srcReadPtr = src - srcStart;
@@ -2779,6 +3185,7 @@ UtfToUtf16Proc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -2803,11 +3210,14 @@ UtfToUtf16Proc(
break;
}
len = TclUtfToUCS4(src, &ch);
- if ((ch & ~0x7FF) == 0xD800) {
- if (STOPONERROR) {
+ if (SURROGATE(ch)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
src += len;
if (flags & TCL_ENCODING_LE) {
@@ -2878,12 +3288,10 @@ UtfToUcs2Proc(
* output buffer. */
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
- int result, numChars;
-#if TCL_UTF_MAX < 4
- int len;
-#endif
+ int result, numChars, len;
Tcl_UniChar ch = 0;
+ flags = TclEncodingSetProfileFlags(flags);
flags |= PTR2INT(clientData);
srcStart = src;
srcEnd = src + srcLen;
@@ -2911,17 +3319,32 @@ UtfToUcs2Proc(
break;
}
#if TCL_UTF_MAX < 4
- src += (len = TclUtfToUniChar(src, &ch));
+ len = TclUtfToUniChar(src, &ch);
if ((ch >= 0xD800) && (len < 3)) {
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ src += len;
src += TclUtfToUniChar(src, &ch);
- ch = 0xFFFD;
+ ch = UNICODE_REPLACE_CHAR;
}
#else
- src += TclUtfToUniChar(src, &ch);
+ len = TclUtfToUniChar(src, &ch);
if (ch > 0xFFFF) {
- ch = 0xFFFD;
+ if (PROFILE_STRICT(flags)) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ ch = UNICODE_REPLACE_CHAR;
}
#endif
+ if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+
+ src += len;
/*
* Need to handle this in a way that won't cause misalignment by
@@ -2991,6 +3414,7 @@ TableToUtfProc(
const unsigned short *pageZero;
TableEncodingData *dataPtr = (TableEncodingData *)clientData;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3014,30 +3438,47 @@ TableToUtfProc(
if (prefixBytes[byte]) {
src++;
if (src >= srcEnd) {
- src--;
- result = TCL_CONVERT_MULTIBYTE;
- break;
+ if (!(flags & TCL_ENCODING_END)) {
+ src--;
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ } else if (PROFILE_STRICT(flags)) {
+ src--;
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ src--; /* See bug [bdcb5126c0] */
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ } else {
+ ch = toUnicode[byte][*((unsigned char *)src)];
}
- ch = toUnicode[byte][*((unsigned char *) src)];
} else {
ch = pageZero[byte];
}
if ((ch == 0) && (byte != 0)) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
break;
}
if (prefixBytes[byte]) {
src--;
}
- ch = (Tcl_UniChar) byte;
+ if (PROFILE_REPLACE(flags)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ ch = (Tcl_UniChar)byte;
+ }
}
/*
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -3107,6 +3548,7 @@ TableFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3139,11 +3581,11 @@ TableFromUtfProc(
word = fromUnicode[(ch >> 8)][ch & 0xFF];
if ((word == 0) && (ch != 0)) {
- if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
- word = dataPtr->fallback;
+ word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
}
if (prefixBytes[(word >> 8)] != 0) {
if (dst + 1 > dstEnd) {
@@ -3213,6 +3655,7 @@ Iso88591ToUtfProc(
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3236,7 +3679,7 @@ Iso88591ToUtfProc(
* Special case for 1-byte utf chars for speed.
*/
- if (ch && ch < 0x80) {
+ if ((unsigned)ch - 1 < 0x7F) {
*dst++ = (char) ch;
} else {
dst += Tcl_UniCharToUtf(ch, dst);
@@ -3297,6 +3740,7 @@ Iso88591FromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3327,7 +3771,7 @@ Iso88591FromUtfProc(
|| ((ch >= 0xD800) && (len < 3))
#endif
) {
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3340,7 +3784,7 @@ Iso88591FromUtfProc(
* Plunge on, using '?' as a fallback character.
*/
- ch = (Tcl_UniChar) '?';
+ ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */
}
if (dst > dstEnd) {
@@ -3444,6 +3888,7 @@ EscapeToUtfProc(
int state, result, numChars, charLimit = INT_MAX;
const char *dstStart, *dstEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -3554,9 +3999,10 @@ EscapeToUtfProc(
if ((checked == dataPtr->numSubTables + 2)
|| (flags & TCL_ENCODING_END)) {
- if (!STOPONERROR) {
+ if (!PROFILE_STRICT(flags)) {
/*
- * Skip the unknown escape sequence.
+ * Skip the unknown escape sequence. TODO - bug?
+ * May be replace with UNICODE_REPLACE_CHAR?
*/
src += longest;
@@ -3666,6 +4112,7 @@ EscapeFromUtfProc(
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
+ flags = TclEncodingSetProfileFlags(flags);
if ((flags & TCL_ENCODING_END) == 0) {
srcClose -= TCL_UTF_MAX;
}
@@ -3729,7 +4176,7 @@ EscapeFromUtfProc(
if (word == 0) {
state = oldState;
- if (STOPONERROR) {
+ if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3980,12 +4427,11 @@ unilen4(
static void
InitializeEncodingSearchPath(
char **valuePtr,
- size_t *lengthPtr,
+ TCL_HASH_TYPE *lengthPtr,
Tcl_Encoding *encodingPtr)
{
const char *bytes;
- size_t i, numDirs;
- size_t numBytes;
+ Tcl_Size i, numDirs, numBytes;
Tcl_Obj *libPathObj, *encodingObj, *searchPathObj;
TclNewLiteralStringObj(encodingObj, "encoding");
@@ -4024,6 +4470,166 @@ InitializeEncodingSearchPath(
}
/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileParseName --
+ *
+ * Maps an encoding profile name to its integer equivalent.
+ *
+ * Results:
+ * TCL_OK on success or TCL_ERROR on failure.
+ *
+ * Side effects:
+ * Returns the profile enum value in *profilePtr
+ *
+ *------------------------------------------------------------------------
+ */
+int
+TclEncodingProfileNameToId(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ const char *profileName, /* Name of profile */
+ int *profilePtr) /* Output */
+{
+ size_t i;
+ size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+
+ for (i = 0; i < numProfiles; ++i) {
+ if (!strcmp(profileName, encodingProfiles[i].name)) {
+ *profilePtr = encodingProfiles[i].value;
+ return TCL_OK;
+ }
+ }
+ if (interp) {
+ Tcl_Obj *errorObj;
+ /* This code assumes at least two profiles :-) */
+ errorObj =
+ Tcl_ObjPrintf("bad profile name \"%s\": must be",
+ profileName);
+ for (i = 0; i < (numProfiles - 1); ++i) {
+ Tcl_AppendStringsToObj(
+ errorObj, " ", encodingProfiles[i].name, ",", NULL);
+ }
+ Tcl_AppendStringsToObj(
+ errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL);
+
+ Tcl_SetObjResult(interp, errorObj);
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILE", profileName, NULL);
+ }
+ return TCL_ERROR;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingProfileValueToName --
+ *
+ * Maps an encoding profile value to its name.
+ *
+ * Results:
+ * Pointer to the name or NULL on failure. Caller must not make
+ * not modify the string and must make a copy to hold on to it.
+ *
+ * Side effects:
+ * None.
+ *------------------------------------------------------------------------
+ */
+const char *
+TclEncodingProfileIdToName(
+ Tcl_Interp *interp, /* For error messages. May be NULL */
+ int profileValue) /* Profile #define value */
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) {
+ if (profileValue == encodingProfiles[i].value) {
+ return encodingProfiles[i].name;
+ }
+ }
+ if (interp) {
+ Tcl_SetObjResult(
+ interp,
+ Tcl_ObjPrintf(
+ "Internal error. Bad profile id \"%d\".",
+ profileValue));
+ Tcl_SetErrorCode(
+ interp, "TCL", "ENCODING", "PROFILEID", NULL);
+ }
+ return NULL;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclEncodingSetProfileFlags --
+ *
+ * Maps the flags supported in the encoding C API's to internal flags.
+ *
+ * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is
+ * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile
+ * specified.
+ *
+ * If no profile or an invalid profile is specified, it is set to
+ * the default.
+ *
+ * Results:
+ * Internal encoding flag mask.
+ *
+ * Side effects:
+ * None.
+ *
+ *------------------------------------------------------------------------
+ */
+int TclEncodingSetProfileFlags(int flags)
+{
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT);
+ } else {
+ int profile = CHANNEL_PROFILE_GET(flags);
+ switch (profile) {
+ case TCL_ENCODING_PROFILE_TCL8:
+ case TCL_ENCODING_PROFILE_STRICT:
+ case TCL_ENCODING_PROFILE_REPLACE:
+ break;
+ case 0: /* Unspecified by caller */
+ default:
+ CHANNEL_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT);
+ break;
+ }
+ }
+ return flags;
+}
+
+/*
+ *------------------------------------------------------------------------
+ *
+ * TclGetEncodingProfiles --
+ *
+ * Get the list of supported encoding profiles.
+ *
+ * Results:
+ * None.
+ *
+ * Side effects:
+ * The list of profile names is stored in the interpreter result.
+ *
+ *------------------------------------------------------------------------
+ */
+void
+TclGetEncodingProfiles(Tcl_Interp *interp)
+{
+ size_t i, n;
+ Tcl_Obj *objPtr;
+ n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]);
+ objPtr = Tcl_NewListObj(n, NULL);
+ for (i = 0; i < n; ++i) {
+ Tcl_ListObjAppendElement(
+ interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
+ }
+ Tcl_SetObjResult(interp, objPtr);
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4