summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-17 12:46:31 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-17 12:46:31 (GMT)
commit74f570670c3ca45a4ff87a051d9ecdadb396dc56 (patch)
treed4dffcd7bdf37dcc8542df142df27ad4b5e40d69 /generic/tclEncoding.c
parent9d25ea9cd7a4e93d8870ddcb15b4677e5d908e2e (diff)
downloadtcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.zip
tcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.tar.gz
tcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.tar.bz2
Add wtf-16 encodings to the set. With testcases
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c119
1 files changed, 53 insertions, 66 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 2198c33..2707972 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -587,12 +587,21 @@ TclInitEncodingSubsystem(void)
type.encodingName = "utf-16le";
type.clientData = INT2PTR(FLAG_LE);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16le";
+ type.clientData = INT2PTR(FLAG_LE + FLAG_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16be";
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16be";
+ type.clientData = INT2PTR(FLAG_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16";
type.clientData = INT2PTR(isLe.c);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16";
+ type.clientData = INT2PTR(isLe.c + FLAG_WTF);
+ Tcl_CreateEncoding(&type);
#ifndef TCL_NO_DEPRECATED
type.encodingName = "unicode";
@@ -2281,11 +2290,7 @@ UtfToUtfProc(
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2309,11 +2314,8 @@ UtfToUtfProc(
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
int encflags = PTR2INT(clientData);
- int *chPtr = (int *) statePtr;
+ int ch;
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
result = TCL_OK;
srcStart = src;
@@ -2350,7 +2352,6 @@ UtfToUtfProc(
*/
*dst++ = *src++;
- *chPtr = 0; /* reset surrogate handling */
} else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
(src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
/*
@@ -2358,7 +2359,6 @@ UtfToUtfProc(
*/
*dst++ = 0;
- *chPtr = 0; /* reset surrogate handling */
src += 2;
} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
@@ -2367,32 +2367,32 @@ UtfToUtfProc(
* incomplete char its bytes are made to represent themselves.
*/
- *chPtr = UCHAR(*src);
+ ch = UCHAR(*src);
src += 1;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ dst += Tcl_UniCharToUtf(ch, dst);
} else {
- src += TclUtfToUCS4(src, chPtr);
- if ((*chPtr | 0x7FF) == 0xDFFF) {
+ src += TclUtfToUCS4(src, &ch);
+ if ((ch | 0x7FF) == 0xDFFF) {
/* A surrogate character is detected, handle especially */
- int low = *chPtr;
+ int low = ch;
size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
- if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
+ if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) {
- *chPtr = 0xFFFD;
+ ch = 0xFFFD;
}
- *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
- *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
+ *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((ch | 0x80) & 0xBF);
continue;
}
src += len;
- dst += Tcl_UniCharToUtf(*chPtr, dst);
- *chPtr = low;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = low;
} else if ((pureNullMode == 1) && !(encflags & FLAG_WTF)
- && !Tcl_UniCharIsUnicode(*chPtr)) {
- *chPtr = 0xFFFD;
+ && !Tcl_UniCharIsUnicode(ch)) {
+ ch = 0xFFFD;
}
- dst += Tcl_UniCharToUtf(*chPtr, dst);
+ dst += Tcl_UniCharToUtf(ch, dst);
}
}
@@ -2420,7 +2420,7 @@ UtfToUtfProc(
static int
Utf16ToUtfProc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData, /* flags */
const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2444,6 +2444,7 @@ Utf16ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
+ int encflags = PTR2INT(clientData);
unsigned short ch;
if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2457,7 +2458,7 @@ Utf16ToUtfProc(
srcLen--;
}
/* If last code point is a high surrogate, we cannot handle that yet */
- if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
+ if ((srcLen >= 2) && ((src[srcLen - ((encflags & FLAG_LE)?1:2)] & 0xFC) == 0xD8)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 2;
}
@@ -2474,7 +2475,7 @@ Utf16ToUtfProc(
break;
}
- if (clientData) {
+ if (encflags & FLAG_LE) {
ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
@@ -2515,15 +2516,11 @@ Utf16ToUtfProc(
static int
UtfToUtf16Proc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData1, /* flags */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
- Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
- * information used during a piecewise
- * conversion. Contents of statePtr are
- * initialized and/or reset by conversion
- * routine under control of flags argument. */
+ TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2542,11 +2539,9 @@ UtfToUtf16Proc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ int encflags = PTR2INT(clientData1);
+ int ch;
- if (flags & TCL_ENCODING_START) {
- *statePtr = 0;
- }
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
@@ -2572,38 +2567,30 @@ UtfToUtf16Proc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUniChar(src, chPtr);
-
- if (clientData) {
-#if TCL_UTF_MAX > 3
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
+ src += TclUtfToUCS4(src, &ch);
+ if (!(encflags & FLAG_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+ ch = 0xFFFD;
+ }
+ if (encflags & FLAG_LE) {
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (*chPtr & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (ch & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
}
-#else
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
-#endif
} else {
-#if TCL_UTF_MAX > 3
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
+ if (ch <= 0xFFFF) {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
} else {
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
- *dst++ = (*chPtr & 0xFF);
+ *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+ *dst++ = ((ch >> 8) & 0x3) | 0xDC;
+ *dst++ = (ch & 0xFF);
}
-#else
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
-#endif
}
}
*srcReadPtr = src - srcStart;