summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2019-06-28 22:36:12 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2019-06-28 22:36:12 (GMT)
commit745219e06c4a987b8ecd3ca4aa9551bc1a10bb4b (patch)
tree81aff5156241ba2e4530634c8a2b0bb11ed2f9b0 /generic
parenta4d19fb0850b5a941a9689e852704e38c54fa84d (diff)
parenta3e27be7687efc74cd599d893a2f2e5c71cf5482 (diff)
downloadtcl-745219e06c4a987b8ecd3ca4aa9551bc1a10bb4b.zip
tcl-745219e06c4a987b8ecd3ca4aa9551bc1a10bb4b.tar.gz
tcl-745219e06c4a987b8ecd3ca4aa9551bc1a10bb4b.tar.bz2
Implement TIP #547: New encodings: UTF-16, UCS-2
Diffstat (limited to 'generic')
-rw-r--r--generic/tclEncoding.c226
1 files changed, 185 insertions, 41 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 7eb73e8..a88c1a7 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -234,12 +234,17 @@ static int TableToUtfProc(ClientData clientData, const char *src,
char *dst, int dstLen, int *srcReadPtr,
int *dstWrotePtr, int *dstCharsPtr);
static size_t unilen(const char *src);
-static int UniCharToUtfProc(ClientData clientData,
+static int Utf16ToUtfProc(ClientData clientData,
const char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
int *dstCharsPtr);
-static int UtfToUniCharProc(ClientData clientData,
+static int UtfToUtf16Proc(ClientData clientData,
+ const char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr);
+static int UtfToUcs2Proc(ClientData clientData,
const char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
@@ -564,11 +569,16 @@ TclInitEncodingSubsystem(void)
TableEncodingData *dataPtr;
unsigned size;
unsigned short i;
+ union {
+ char c;
+ short s;
+ } isLe;
if (encodingsInitialized) {
return;
}
+ isLe.s = 1;
Tcl_MutexLock(&encodingMutex);
Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
Tcl_MutexUnlock(&encodingMutex);
@@ -595,13 +605,38 @@ TclInitEncodingSubsystem(void)
type.clientData = NULL;
Tcl_CreateEncoding(&type);
- type.encodingName = "unicode";
- type.toUtfProc = UniCharToUtfProc;
- type.fromUtfProc = UtfToUniCharProc;
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUcs2Proc;
type.freeProc = NULL;
type.nullSize = 2;
- type.clientData = NULL;
+ type.encodingName = "ucs-2le";
+ type.clientData = INT2PTR(1);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "ucs-2";
+ type.clientData = INT2PTR(isLe.c);
+ Tcl_CreateEncoding(&type);
+
+ type.toUtfProc = Utf16ToUtfProc;
+ type.fromUtfProc = UtfToUtf16Proc;
+ type.freeProc = NULL;
+ type.nullSize = 2;
+ type.encodingName = "utf-16le";
+ type.clientData = INT2PTR(1);;
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16be";
+ type.clientData = INT2PTR(0);
+ Tcl_CreateEncoding(&type);
+ type.encodingName = "utf-16";
+ type.clientData = INT2PTR(isLe.c);;
+ Tcl_CreateEncoding(&type);
+
+#ifndef TCL_NO_DEPRECATED
+ type.encodingName = "unicode";
Tcl_CreateEncoding(&type);
+#endif
/*
* Need the iso8859-1 encoding in order to process binary data, so force
@@ -1279,7 +1314,7 @@ Tcl_ExternalToUtf(
if (*dstCharsPtr <= maxChars) {
break;
}
- dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX;
+ dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
flags = savedFlags;
*statePtr = savedState;
} while (1);
@@ -2401,9 +2436,9 @@ UtfToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UniCharToUtfProc --
+ * Utf16ToUtfProc --
*
- * Convert from Unicode to UTF-8.
+ * Convert from UTF-16 to UTF-8.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2415,8 +2450,8 @@ UtfToUtfProc(
*/
static int
-UniCharToUtfProc(
- ClientData clientData, /* Not used. */
+Utf16ToUtfProc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
const char *src, /* Source string in Unicode. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2468,12 +2503,15 @@ UniCharToUtfProc(
break;
}
+ if (clientData) {
+ ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
+ } else {
+ ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
+ }
/*
* Special case for 1-byte utf chars for speed. Make sure we work with
* unsigned short-size data.
*/
-
- ch = *(unsigned short *)src;
if (ch && ch < 0x80) {
*dst++ = (ch & 0xFF);
} else {
@@ -2491,9 +2529,9 @@ UniCharToUtfProc(
/*
*-------------------------------------------------------------------------
*
- * UtfToUniCharProc --
+ * UtfToUtf16Proc --
*
- * Convert from UTF-8 to Unicode.
+ * Convert from UTF-8 to UTF-16.
*
* Results:
* Returns TCL_OK if conversion was successful.
@@ -2505,9 +2543,8 @@ UniCharToUtfProc(
*/
static int
-UtfToUniCharProc(
- ClientData clientData, /* TableEncodingData that specifies
- * encoding. */
+UtfToUtf16Proc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2571,44 +2608,151 @@ UtfToUniCharProc(
* casting dst to a Tcl_UniChar. [Bug 1122671]
*/
-#ifdef WORDS_BIGENDIAN
+ if (clientData) {
#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
- } else {
- *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- }
-#else
- *dst++ = (*chPtr >> 8);
- *dst++ = (*chPtr & 0xFF);
-#endif
+ if (*chPtr <= 0xFFFF) {
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = (*chPtr >> 8);
+ } else {
+ *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+ *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+ }
#else
-#if TCL_UTF_MAX > 4
- if (*chPtr <= 0xFFFF) {
*dst++ = (*chPtr & 0xFF);
*dst++ = (*chPtr >> 8);
+#endif
} else {
- *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
- *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+#if TCL_UTF_MAX > 4
+ if (*chPtr <= 0xFFFF) {
+ *dst++ = (*chPtr >> 8);
+ *dst++ = (*chPtr & 0xFF);
+ } else {
+ *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
+ *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
+ }
+#else
+ *dst++ = (*chPtr >> 8);
*dst++ = (*chPtr & 0xFF);
- *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
+#endif
}
-#else
- *dst++ = (*chPtr & 0xFF);
- *dst++ = (*chPtr >> 8);
+ }
+ *srcReadPtr = src - srcStart;
+ *dstWrotePtr = dst - dstStart;
+ *dstCharsPtr = numChars;
+ return result;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfToUcs2Proc --
+ *
+ * Convert from UTF-8 to UCS-2.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static int
+UtfToUcs2Proc(
+ ClientData clientData, /* != NULL means LE, == NUL means BE */
+ const char *src, /* Source string in UTF-8. */
+ int srcLen, /* Source string length in bytes. */
+ int flags, /* Conversion control flags. */
+ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
+ * information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst, /* Output buffer in which converted string is
+ * stored. */
+ int dstLen, /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr, /* Filled with the number of bytes from the
+ * source string that were converted. This may
+ * be less than the original source length if
+ * there was a problem converting some source
+ * characters. */
+ int *dstWrotePtr, /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr) /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
+ int result, numChars;
+#if TCL_UTF_MAX <= 4
+ int len;
#endif
+ Tcl_UniChar ch = 0;
+
+ srcStart = src;
+ srcEnd = src + srcLen;
+ srcClose = srcEnd;
+ if ((flags & TCL_ENCODING_END) == 0) {
+ srcClose -= TCL_UTF_MAX;
+ }
+
+ dstStart = dst;
+ dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+
+ result = TCL_OK;
+ for (numChars = 0; src < srcEnd; numChars++) {
+ if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ /*
+ * If there is more string to follow, this will ensure that the
+ * last UTF-8 character in the source buffer hasn't been cut off.
+ */
+
+ result = TCL_CONVERT_MULTIBYTE;
+ break;
+ }
+ if (dst > dstEnd) {
+ result = TCL_CONVERT_NOSPACE;
+ break;
+ }
+#if TCL_UTF_MAX <= 4
+ src += (len = TclUtfToUniChar(src, &ch));
+ if ((ch >= 0xD800) && (len < 3)) {
+ src += TclUtfToUniChar(src, &ch);
+ ch = 0xFFFD;
+ }
+#else
+ src += TclUtfToUniChar(src, &ch);
+ if (ch > 0xFFFF) {
+ ch = 0xFFFD;
+ }
#endif
+
+ /*
+ * Need to handle this in a way that won't cause misalignment by
+ * casting dst to a Tcl_UniChar. [Bug 1122671]
+ */
+
+ if (clientData) {
+ *dst++ = (ch & 0xFF);
+ *dst++ = (ch >> 8);
+ } else {
+ *dst++ = (ch >> 8);
+ *dst++ = (ch & 0xFF);
+ }
}
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
return result;
}
-
+
/*
*-------------------------------------------------------------------------
*