summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c83
1 files changed, 71 insertions, 12 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 0bce51b..e68aa25 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -512,6 +512,8 @@ FillEncodingFileMap(void)
/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */
+#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */
+#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
void
TclInitEncodingSubsystem(void)
@@ -553,7 +555,13 @@ TclInitEncodingSubsystem(void)
type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = NULL;
+ type.clientData = INT2PTR(TCL_ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
+ type.encodingName = "wtf-8";
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(0);
+ type.encodingName = "cesu-8";
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
@@ -577,12 +585,21 @@ TclInitEncodingSubsystem(void)
type.encodingName = "utf-16le";
type.clientData = INT2PTR(TCL_ENCODING_LE);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16le";
+ type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16be";
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16be";
+ type.clientData = INT2PTR(TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16";
type.clientData = INT2PTR(isLe.c);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16";
+ type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
#ifndef TCL_NO_DEPRECATED
type.encodingName = "unicode";
@@ -1153,7 +1170,7 @@ Tcl_ExternalToUtfDStringEx(
srcLen = encodingPtr->lengthProc(src);
}
- flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
while (1) {
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1268,12 +1285,12 @@ Tcl_ExternalToUtf(
dstLen--;
}
- flags |= TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
do {
Tcl_EncodingState savedState = *statePtr;
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
- flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
+ flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
dstCharsPtr);
if (*dstCharsPtr <= maxChars) {
break;
@@ -2200,7 +2217,7 @@ BinaryProc(
static int
UtfToUtfProc(
- TCL_UNUSED(ClientData),
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2239,7 +2256,8 @@ UtfToUtfProc(
}
dstStart = dst;
- dstEnd = dst + dstLen - TCL_UTF_MAX;
+ flags |= PTR2INT(clientData);
+ dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2286,21 +2304,44 @@ UtfToUtfProc(
src += 1;
dst += Tcl_UniCharToUtf(ch, dst);
} else {
+ int low;
+ const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
result = TCL_CONVERT_SYNTAX;
break;
}
src += len;
- if ((ch | 0x7FF) == 0xDFFF) {
+ if (!(flags & TCL_ENCODING_UTF)) {
+ if (ch > 0xFFFF) {
+ /* CESU-8 6-byte sequence for chars > U+FFFF */
+ ch -= 0x10000;
+ *dst++ = 0xED;
+ *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
+ *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
+ ch = (ch & 0x0CFF) | 0xDC00;
+ }
+ goto cesu8;
+ } else if ((ch | 0x7FF) == 0xDFFF) {
/*
* A surrogate character is detected, handle especially.
*/
- int low = ch;
+ low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
+ if (!(flags & TCL_ENCODING_WTF)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
+ }
+ cesu8:
*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((ch | 0x80) & 0xBF);
@@ -2309,6 +2350,15 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
+ } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
}
dst += Tcl_UniCharToUtf(ch, dst);
}
@@ -2444,7 +2494,7 @@ Utf16ToUtfProc(
static int
UtfToUtf16Proc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2467,7 +2517,7 @@ UtfToUtf16Proc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- int ch;
+ int ch, len;
srcStart = src;
srcEnd = src + srcLen;
@@ -2478,6 +2528,7 @@ UtfToUtf16Proc(
dstStart = dst;
dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+ flags |= PTR2INT(clientData);
result = TCL_OK;
for (numChars = 0; src < srcEnd; numChars++) {
@@ -2494,8 +2545,16 @@ UtfToUtf16Proc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUCS4(src, &ch);
- if (clientData) {
+ len = TclUtfToUCS4(src, &ch);
+ if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ ch = 0xFFFD;
+ }
+ src += len;
+ if (flags & TCL_ENCODING_LE) {
if (ch <= 0xFFFF) {
*dst++ = (ch & 0xFF);
*dst++ = (ch >> 8);