summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c298
1 files changed, 98 insertions, 200 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 3ef4e58..8a29722 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -220,14 +220,7 @@ static size_t unilen(const char *src);
static Tcl_EncodingConvertProc Utf16ToUtfProc;
static Tcl_EncodingConvertProc UtfToUtf16Proc;
static Tcl_EncodingConvertProc UtfToUcs2Proc;
-static int UtfToUtfProc(ClientData clientData,
- const char *src, int srcLen, int flags,
- Tcl_EncodingState *statePtr, char *dst,
- int dstLen, int *srcReadPtr,
- int *dstWrotePtr, int *dstCharsPtr,
- int pureNullMode);
-static Tcl_EncodingConvertProc UtfIntToUtfExtProc;
-static Tcl_EncodingConvertProc UtfExtToUtfIntProc;
+static Tcl_EncodingConvertProc UtfToUtfProc;
static Tcl_EncodingConvertProc Iso88591FromUtfProc;
static Tcl_EncodingConvertProc Iso88591ToUtfProc;
@@ -517,6 +510,10 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
+/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
+#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */
+#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */
+
void
TclInitEncodingSubsystem(void)
{
@@ -525,7 +522,7 @@ TclInitEncodingSubsystem(void)
unsigned size;
unsigned short i;
union {
- char c;
+ unsigned char c;
short s;
} isLe;
@@ -533,7 +530,7 @@ TclInitEncodingSubsystem(void)
return;
}
- isLe.s = 1;
+ isLe.s = TCL_ENCODING_LE;
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -553,8 +550,8 @@ TclInitEncodingSubsystem(void)
tclIdentityEncoding = Tcl_CreateEncoding(&type);
type.encodingName = "utf-8";
- type.toUtfProc = UtfExtToUtfIntProc;
- type.fromUtfProc = UtfIntToUtfExtProc;
+ type.toUtfProc = UtfToUtfProc;
+ type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
type.clientData = NULL;
@@ -565,7 +562,7 @@ TclInitEncodingSubsystem(void)
type.freeProc = NULL;
type.nullSize = 2;
type.encodingName = "ucs-2le";
- type.clientData = INT2PTR(1);
+ type.clientData = INT2PTR(TCL_ENCODING_LE);
Tcl_CreateEncoding(&type);
type.encodingName = "ucs-2be";
type.clientData = INT2PTR(0);
@@ -579,7 +576,7 @@ TclInitEncodingSubsystem(void)
type.freeProc = NULL;
type.nullSize = 2;
type.encodingName = "utf-16le";
- type.clientData = INT2PTR(1);
+ type.clientData = INT2PTR(TCL_ENCODING_LE);
Tcl_CreateEncoding(&type);
type.encodingName = "utf-16be";
type.clientData = INT2PTR(0);
@@ -1077,7 +1074,7 @@ Tcl_ExternalToUtfDString(
srcLen = encodingPtr->lengthProc(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END;
+ flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
while (1) {
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1186,25 +1183,24 @@ Tcl_ExternalToUtf(
if (!noTerminate) {
/*
* If there are any null characters in the middle of the buffer,
- * they will converted to the UTF-8 null character (\xC080). To get
+ * they will converted to the UTF-8 null character (\xC0\x80). To get
* the actual \0 at the end of the destination buffer, we need to
* append it manually. First make room for it...
*/
dstLen--;
}
+ flags |= TCL_ENCODING_MODIFIED;
do {
- int savedFlags = flags;
Tcl_EncodingState savedState = *statePtr;
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
- flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
+ flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
dstCharsPtr);
if (*dstCharsPtr <= maxChars) {
break;
}
dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
- flags = savedFlags;
*statePtr = savedState;
} while (1);
if (!noTerminate) {
@@ -1269,8 +1265,8 @@ Tcl_UtfToExternalDString(
flags = TCL_ENCODING_START | TCL_ENCODING_END;
while (1) {
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
- srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
- &dstChars);
+ srcLen, flags, &state, dst, dstLen,
+ &srcRead, &dstWrote, &dstChars);
soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
if (result != TCL_CONVERT_NOSPACE) {
@@ -1371,8 +1367,8 @@ Tcl_UtfToExternal(
dstLen -= encodingPtr->nullSize;
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
- flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
- dstCharsPtr);
+ flags, statePtr, dst, dstLen, srcReadPtr,
+ dstWrotePtr, dstCharsPtr);
if (encodingPtr->nullSize == 2) {
dst[*dstWrotePtr + 1] = '\0';
}
@@ -2095,104 +2091,6 @@ BinaryProc(
/*
*-------------------------------------------------------------------------
*
- * UtfIntToUtfExtProc --
- *
- * Convert from UTF-8 to UTF-8. While converting null-bytes from the
- * Tcl's internal representation (0xC0, 0x80) to the official
- * representation (0x00). See UtfToUtfProc for details.
- *
- * Results:
- * Returns TCL_OK if conversion was successful.
- *
- * Side effects:
- * None.
- *
- *-------------------------------------------------------------------------
- */
-
-static int
-UtfIntToUtfExtProc(
- ClientData clientData,
- const char *src, /* Source string in UTF-8. */
- int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
- char *dst, /* Output buffer in which converted string
- * is stored. */
- int dstLen, /* The maximum length of output buffer in
- * bytes. */
- int *srcReadPtr, /* Filled with the number of bytes from the
- * source string that were converted. This may
- * be less than the original source length if
- * there was a problem converting some source
- * characters. */
- int *dstWrotePtr, /* Filled with the number of bytes that were
- * stored in the output buffer as a result of
- * the conversion. */
- int *dstCharsPtr) /* Filled with the number of characters that
- * correspond to the bytes stored in the
- * output buffer. */
-{
- return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
-}
-
-/*
- *-------------------------------------------------------------------------
- *
- * UtfExtToUtfIntProc --
- *
- * Convert from UTF-8 to UTF-8 while converting null-bytes from the
- * official representation (0x00) to Tcl's internal representation (0xC0,
- * 0x80). See UtfToUtfProc for details.
- *
- * Results:
- * Returns TCL_OK if conversion was successful.
- *
- * Side effects:
- * None.
- *
- *-------------------------------------------------------------------------
- */
-
-static int
-UtfExtToUtfIntProc(
- ClientData clientData,
- const char *src, /* Source string in UTF-8. */
- int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
- char *dst, /* Output buffer in which converted string is
- * stored. */
- int dstLen, /* The maximum length of output buffer in
- * bytes. */
- int *srcReadPtr, /* Filled with the number of bytes from the
- * source string that were converted. This may
- * be less than the original source length if
- * there was a problem converting some source
- * characters. */
- int *dstWrotePtr, /* Filled with the number of bytes that were
- * stored in the output buffer as a result of
- * the conversion. */
- int *dstCharsPtr) /* Filled with the number of characters that
- * correspond to the bytes stored in the
- * output buffer. */
-{
- return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
-}
-
-/*
- *-------------------------------------------------------------------------
- *
* UtfToUtfProc --
*
* Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
@@ -2214,11 +2112,7 @@ UtfToUtfProc(
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2231,21 +2125,15 @@ UtfToUtfProc(
int *dstWrotePtr, /* Filled with the number of bytes that were
* stored in the output buffer as a result of
* the conversion. */
- int *dstCharsPtr, /* Filled with the number of characters that
+ int *dstCharsPtr) /* Filled with the number of characters that
* correspond to the bytes stored in the
* output buffer. */
- int pureNullMode) /* Convert embedded nulls from internal
- * representation to real null-bytes or vice
- * versa. Also combine or separate surrogate pairs */
{
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
- int *chPtr = (int *) statePtr;
+ int ch;
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
result = TCL_OK;
srcStart = src;
@@ -2262,7 +2150,7 @@ UtfToUtfProc(
dstEnd = dst + dstLen - TCL_UTF_MAX;
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
- if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
+ if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
* last UTF-8 character in the source buffer hasn't been cut off.
@@ -2275,48 +2163,62 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+ if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) {
/*
* Copy 7bit characters, but skip null-bytes when we are in input
* mode, so that they get converted to 0xC080.
*/
*dst++ = *src++;
- } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
- (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
+ } else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd)
+ && UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) {
/*
* Convert 0xC080 to real nulls when we are in output mode.
*/
*dst++ = 0;
src += 2;
- } else if (!TclUCS4Complete(src, srcEnd - src)) {
+ } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
* Always check before using TclUtfToUCS4. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
- * incomplete char its bytes are made to represent themselves.
+ * incomplete char its bytes are made to represent themselves
+ * unless the user has explicitly asked to be told.
*/
- *chPtr = UCHAR(*src);
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ ch = UCHAR(*src);
src += 1;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ dst += Tcl_UniCharToUtf(ch, dst);
} else {
- src += TclUtfToUCS4(src, chPtr);
- if ((*chPtr | 0x7FF) == 0xDFFF) {
- /* A surrogate character is detected, handle especially */
- int low = *chPtr;
- size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
- if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
- *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
- *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
- continue;
+ size_t len = TclUtfToUCS4(src, &ch);
+ if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ src += len;
+ if ((ch | 0x7FF) == 0xDFFF) {
+ /*
+ * A surrogate character is detected, handle especially.
+ */
+
+ int low = ch;
+ len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
+
+ if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
+ *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((ch | 0x80) & 0xBF);
+ continue;
}
src += len;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
- *chPtr = low;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = low;
}
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ dst += Tcl_UniCharToUtf(ch, dst);
}
}
@@ -2344,7 +2246,7 @@ UtfToUtfProc(
static int
Utf16ToUtfProc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2370,18 +2272,27 @@ Utf16ToUtfProc(
int result, numChars, charLimit = INT_MAX;
unsigned short ch;
+ flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
result = TCL_OK;
- /* check alignment with utf-16 (2 == sizeof(UTF-16)) */
+ /*
+ * Check alignment with utf-16 (2 == sizeof(UTF-16))
+ */
+
if ((srcLen % 2) != 0) {
result = TCL_CONVERT_MULTIBYTE;
srcLen--;
}
- /* If last code point is a high surrogate, we cannot handle that yet */
- if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
+
+ /*
+ * If last code point is a high surrogate, we cannot handle that yet.
+ */
+
+ if ((srcLen >= 2) &&
+ ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 2;
}
@@ -2398,15 +2309,17 @@ Utf16ToUtfProc(
break;
}
- if (clientData) {
+ if (flags & TCL_ENCODING_LE) {
ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
+
/*
* Special case for 1-byte utf chars for speed. Make sure we work with
* unsigned short-size data.
*/
+
if (ch && ch < 0x80) {
*dst++ = (ch & 0xFF);
} else {
@@ -2443,11 +2356,7 @@ UtfToUtf16Proc(
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2466,11 +2375,8 @@ UtfToUtf16Proc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ int ch;
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
@@ -2496,38 +2402,27 @@ UtfToUtf16Proc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUniChar(src, chPtr);
-
+ src += TclUtfToUCS4(src, &ch);
if (clientData) {
-#if TCL_UTF_MAX > 3
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (*chPtr & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (ch & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
}
-#else
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
-#endif
} else {
-#if TCL_UTF_MAX > 3
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
- *dst++ = (*chPtr & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
+ *dst++ = (ch & 0xFF);
}
-#else
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
-#endif
}
}
*srcReadPtr = src - srcStart;
@@ -2554,7 +2449,7 @@ UtfToUtf16Proc(
static int
UtfToUcs2Proc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2582,6 +2477,7 @@ UtfToUcs2Proc(
#endif
Tcl_UniChar ch = 0;
+ flags |= PTR2INT(clientData);
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
@@ -2625,7 +2521,7 @@ UtfToUcs2Proc(
* casting dst to a Tcl_UniChar. [Bug 1122671]
*/
- if (clientData) {
+ if (flags & TCL_ENCODING_LE) {
*dst++ = (ch & 0xFF);
*dst++ = (ch >> 8);
} else {
@@ -3033,7 +2929,9 @@ Iso88591FromUtfProc(
break;
}
#if TCL_UTF_MAX <= 3
- if ((ch >= 0xD800) && (len < 3)) len = 4;
+ if ((ch >= 0xD800) && (len < 3)) {
+ len = 4;
+ }
#endif
/*
* Plunge on, using '?' as a fallback character.
@@ -3078,7 +2976,7 @@ TableFreeProc(
ClientData clientData) /* TableEncodingData that specifies
* encoding. */
{
- TableEncodingData *dataPtr = (TableEncodingData *)clientData;
+ TableEncodingData *dataPtr = (TableEncodingData *) clientData;
/*
* Make sure we aren't freeing twice on shutdown. [Bug 219314]
@@ -3136,7 +3034,7 @@ EscapeToUtfProc(
* correspond to the bytes stored in the
* output buffer. */
{
- EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
+ EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
const unsigned short *const *tableToUnicode;
const Encoding *encodingPtr;