summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclEncoding.c226
-rw-r--r--tests/binary.test2
-rw-r--r--tests/chanio.test4
-rw-r--r--tests/encoding.test23
-rw-r--r--tests/io.test4
-rw-r--r--tests/ioCmd.test8
-rw-r--r--tests/source.test4
7 files changed, 213 insertions, 58 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 7eb73e8..a88c1a7 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -234,12 +234,17 @@ static int TableToUtfProc(ClientData clientData, const char *src,
char *dst, int dstLen, int *srcReadPtr,
int *dstWrotePtr, int *dstCharsPtr);
static size_t unilen(const char *src);
-static int UniCharToUtfProc(ClientData clientData,
+static int Utf16ToUtfProc(ClientData clientData,
const char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
int *dstCharsPtr);
-static int UtfToUniCharProc(ClientData clientData,
+static int UtfToUtf16Proc(ClientData clientData,
+ const char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr);
+static int UtfToUcs2Proc(ClientData clientData,
const char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
@@ -564,11 +569,16 @@ TclInitEncodingSubsystem(void)
TableEncodingData *dataPtr;
unsigned size;
unsigned short i;
+ union {
+ char c;
+ short s;
+ } isLe;
if (encodingsInitialized) {
return;
}
+ isLe.s = 1;
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -595,13 +605,38 @@ TclInitEncodingSubsystem(void)
type.clientData = NULL;
Tcl_CreateEncoding(&type);
- type.encodingName = "unicode";
- type.toUtfProc = UniCharToUtfProc;
- type.fromUtfProc = UtfToUniCharProc;
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUcs2Proc;
type.freeProc = NULL;
type.nullSize = 2;
- type.clientData = NULL;
+ type.encodingName = "ucs-2le";
+ type.clientData = INT2PTR(1);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2";
+ type.clientData = INT2PTR(isLe.c);
+ Tcl_CreateEncoding(&type);
+
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUtf16Proc;
+ type.freeProc = NULL;
+ type.nullSize = 2;
+ type.encodingName = "utf-16le";
+ type.clientData = INT2PTR(1);;
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16";
+ type.clientData = INT2PTR(isLe.c);;
+ Tcl_CreateEncoding(&type);
+
+#ifndef TCL_NO_DEPRECATED
+ type.encodingName = "unicode";
Tcl_CreateEncoding(&type);
+#endif
/*
* Need the iso8859-1 encoding in order to process binary data, so force
@@ -1279,7 +1314,7 @@ Tcl_ExternalToUtf(
if (*dstCharsPtr <= maxChars) {
break;
}
- dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX;
+ dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
flags = savedFlags;
*statePtr = savedState;
} while (1);
@@ -2401,9 +2436,9 @@ UtfToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UniCharToUtfProc --
+ * Utf16ToUtfProc --
*
- * Convert from Unicode to UTF-8.
+ * Convert from UTF-16 to UTF-8.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2415,8 +2450,8 @@ UtfToUtfProc(
*/
static int
-UniCharToUtfProc(
- ClientData clientData, /* Not used. */
+Utf16ToUtfProc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2468,12 +2503,15 @@ UniCharToUtfProc(
break;
}
+ if (clientData) {
+ ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
+ } else {
+ ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
+ }
/*
* Special case for 1-byte utf chars for speed. Make sure we work with
* unsigned short-size data.
*/
-
- ch = *(unsigned short *)src;
if (ch && ch < 0x80) {
*dst++ = (ch & 0xFF);
} else {
@@ -2491,9 +2529,9 @@ UniCharToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UtfToUniCharProc --
+ * UtfToUtf16Proc --
*
- * Convert from UTF-8 to Unicode.
+ * Convert from UTF-8 to UTF-16.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2505,9 +2543,8 @@ UniCharToUtfProc(
*/
static int
-UtfToUniCharProc(
- ClientData clientData, /* TableEncodingData that specifies
- * encoding. */
+UtfToUtf16Proc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2571,44 +2608,151 @@ UtfToUniCharProc(
* casting dst to a Tcl_UniChar. [Bug 1122671]
*/
-#ifdef WORDS_BIGENDIAN
+ if (clientData) {
#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
- } else {
- *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- }
-#else
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
-#endif
+ if (*chPtr <= 0xFFFF) {
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = (*chPtr >> 8);
+ } else {
+ *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+ *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+ }
#else
-#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
*dst++ = (*chPtr & 0xFF);
*dst++ = (*chPtr >> 8);
+#endif
} else {
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+#if TCL_UTF_MAX > 4
+ if (*chPtr <= 0xFFFF) {
+ *dst++ = (*chPtr >> 8);
+ *dst++ = (*chPtr & 0xFF);
+ } else {
+ *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+ }
+#else
+ *dst++ = (*chPtr >> 8);
*dst++ = (*chPtr & 0xFF);
- *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+#endif
}
-#else
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
+ }
+ *srcReadPtr = src - srcStart;
+ *dstWrotePtr = dst - dstStart;
+ *dstCharsPtr = numChars;
+ return result;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfToUcs2Proc --
+ *
+ * Convert from UTF-8 to UCS-2.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+UtfToUcs2Proc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
+ const char *src, /* Source string in UTF-8. */
+ int srcLen, /* Source string length in bytes. */
+ int flags, /* Conversion control flags. */
+ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
+ * information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst, /* Output buffer in which converted string is
+ * stored. */
+ int dstLen, /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr, /* Filled with the number of bytes from the
+ * source string that were converted. This may
+ * be less than the original source length if
+ * there was a problem converting some source
+ * characters. */
+ int *dstWrotePtr, /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr) /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+ int result, numChars;
+#if TCL_UTF_MAX <= 4
+ int len;
#endif
+ Tcl_UniChar ch = 0;
+
+ srcStart = src;
+ srcEnd = src + srcLen;
+ srcClose = srcEnd;
+ if ((flags & TCL_ENCODING_END) == 0) {
+ srcClose -= TCL_UTF_MAX;
+ }
+
+ dstStart = dst;
+ dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+
+ result = TCL_OK;
+ for (numChars = 0; src < srcEnd; numChars++) {
+ if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ /*
+ * If there is more string to follow, this will ensure that the
+ * last UTF-8 character in the source buffer hasn't been cut off.
+ */
+
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ break;
+ }
+#if TCL_UTF_MAX <= 4
+ src += (len = TclUtfToUniChar(src, &ch));
+ if ((ch >= 0xD800) && (len < 3)) {
+ src += TclUtfToUniChar(src, &ch);
+ ch = 0xFFFD;
+ }
+#else
+ src += TclUtfToUniChar(src, &ch);
+ if (ch > 0xFFFF) {
+ ch = 0xFFFD;
+ }
#endif
+
+ /*
+ * Need to handle this in a way that won't cause misalignment by
+ * casting dst to a Tcl_UniChar. [Bug 1122671]
+ */
+
+ if (clientData) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
+ } else {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
+ }
}
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
return result;
}
-
+
/*
*-------------------------------------------------------------------------
*
diff --git a/tests/binary.test b/tests/binary.test
index aede659..973240f 100644
--- a/tests/binary.test
+++ b/tests/binary.test
@@ -2912,7 +2912,7 @@ test binary-77.2 {string cat ops on all bytearrays} {
} [binary format H* abcd]
test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body {
- # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX <= 4):
+ # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX == 3):
binary encode hex \U0001f415
binary scan \U0001f415 a* v; set v
set str {}
diff --git a/tests/chanio.test b/tests/chanio.test
index 9dc9e7c..1439fe4 100644
--- a/tests/chanio.test
+++ b/tests/chanio.test
@@ -888,7 +888,7 @@ test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} -setup
# Tcl_ExternalToUtf()
set f [openpipe w+ $path(cat)]
chan configure $f -translation {auto lf} -buffering none
- chan configure $f -encoding unicode
+ chan configure $f -encoding utf-16
chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
chan configure $f -buffersize 16
chan gets $f
@@ -1129,7 +1129,7 @@ test chan-io-8.2 {PeekAhead: only go to device if no more cached data} -setup {
chan event $f read [namespace code {
lappend x [chan gets $f line] $line [testchannel inputbuffered $f]
}]
- chan configure $f -encoding unicode -buffersize 16 -blocking 0
+ chan configure $f -encoding utf-16 -buffersize 16 -blocking 0
vwait [namespace which -variable x]
chan configure $f -translation auto -encoding ascii -blocking 1
# here
diff --git a/tests/encoding.test b/tests/encoding.test
index 4736928..da34f03 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -322,18 +322,29 @@ test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
set z
} c080
-test encoding-16.1 {UnicodeToUtfProc} -body {
- set val [encoding convertfrom unicode NN]
+test encoding-16.1 {Utf16ToUtfProc} -body {
+ set val [encoding convertfrom utf-16 NN]
list $val [format %x [scan $val %c]]
} -result "\u4e4e 4e4e"
-test encoding-16.2 {UnicodeToUtfProc} -body {
- set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"]
+test encoding-16.2 {Utf16ToUtfProc} -body {
+ set val [encoding convertfrom utf-16 "\xd8\xd8\xdc\xdc"]
+ list $val [format %x [scan $val %c]]
+} -result "\U460dc 460dc"
+test encoding-16.3 {Ucs2ToUtfProc} -body {
+ set val [encoding convertfrom ucs-2 NN]
+ list $val [format %x [scan $val %c]]
+} -result "\u4e4e 4e4e"
+test encoding-16.4 {Ucs2ToUtfProc} -body {
+ set val [encoding convertfrom ucs-2 "\xd8\xd8\xdc\xdc"]
list $val [format %x [scan $val %c]]
} -result "\U460dc 460dc"
-test encoding-17.1 {UtfToUnicodeProc} -body {
- encoding convertto unicode "\U460dc"
+test encoding-17.1 {UtfToUtf16Proc} -body {
+ encoding convertto utf-16 "\U460dc"
} -result "\xd8\xd8\xdc\xdc"
+test encoding-17.2 {UtfToUcs2Proc} -body {
+ encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460dc"]
+} -result "\ufffd"
test encoding-18.1 {TableToUtfProc} {
} {}
diff --git a/tests/io.test b/tests/io.test
index 6470282..39deab6 100644
--- a/tests/io.test
+++ b/tests/io.test
@@ -918,7 +918,7 @@ test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio test
set f [open "|[list [interpreter] $path(cat)]" w+]
fconfigure $f -translation {auto lf} -buffering none
- fconfigure $f -encoding unicode
+ fconfigure $f -encoding utf-16
puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
fconfigure $f -buffersize 16
gets $f
@@ -1162,7 +1162,7 @@ test io-8.2 {PeekAhead: only go to device if no more cached data} {stdio testcha
variable x
lappend x [gets $f line] $line [testchannel inputbuffered $f]
}
- fconfigure $f -encoding unicode -buffersize 16 -blocking 0
+ fconfigure $f -encoding utf-16 -buffersize 16 -blocking 0
vwait [namespace which -variable x]
fconfigure $f -translation auto -encoding ascii -blocking 1
# here
diff --git a/tests/ioCmd.test b/tests/ioCmd.test
index a967139..87ad4af 100644
--- a/tests/ioCmd.test
+++ b/tests/ioCmd.test
@@ -241,23 +241,23 @@ test iocmd-8.7 {fconfigure command} -setup {
file delete $path(test1)
} -body {
set f1 [open $path(test1) w]
- fconfigure $f1 -translation lf -eofchar {} -encoding unicode
+ fconfigure $f1 -translation lf -eofchar {} -encoding utf-16
fconfigure $f1
} -cleanup {
catch {close $f1}
-} -result {-blocking 1 -buffering full -buffersize 4096 -encoding unicode -eofchar {} -translation lf}
+} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -translation lf}
test iocmd-8.8 {fconfigure command} -setup {
file delete $path(test1)
set x {}
} -body {
set f1 [open $path(test1) w]
fconfigure $f1 -translation lf -buffering line -buffersize 3030 \
- -eofchar {} -encoding unicode
+ -eofchar {} -encoding utf-16
lappend x [fconfigure $f1 -buffering]
lappend x [fconfigure $f1]
} -cleanup {
catch {close $f1}
-} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding unicode -eofchar {} -translation lf}}
+} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -translation lf}}
test iocmd-8.9 {fconfigure command} -setup {
file delete $path(test1)
} -body {
diff --git a/tests/source.test b/tests/source.test
index 8b146d3..c6cccd6 100644
--- a/tests/source.test
+++ b/tests/source.test
@@ -240,12 +240,12 @@ test source-7.2 {source -encoding test} -setup {
set sourcefile [makeFile {} source.file]
file delete $sourcefile
set f [open $sourcefile w]
- fconfigure $f -encoding unicode
+ fconfigure $f -encoding utf-16
puts $f "set symbol(square-root) \u221A; set x correct"
close $f
} -body {
set x unset
- source -encoding unicode $sourcefile
+ source -encoding utf-16 $sourcefile
set x
} -cleanup {
removeFile source.file