summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog6
-rw-r--r--generic/tclEncoding.c210
-rw-r--r--generic/tclUtf.c2
-rw-r--r--generic/tclZlib.c27
-rw-r--r--tests/encoding.test9
5 files changed, 237 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index dfe776c..69684e9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1552,6 +1552,12 @@
* library/tzdata/America/Kralendijk: (new)
* library/tzdata/America/Lower_Princes: (new)
+2011-07-27 Donal K. Fellows <dkf@users.sf.net>
+
+ * generic/tclEncoding.c (UtfToUtfProc): Start to rough out what needs
+ to change to transition Tcl to being able to work with non-BMP
+ characters, at least at a basic level.
+
2011-07-26 Donal K. Fellows <dkf@users.sf.net>
* generic/tclOO.c (initScript): Ensure that TclOO is properly found by
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 7a55724..4871b85 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -191,6 +191,14 @@ static Tcl_Encoding systemEncoding;
static unsigned short emptyPage[256];
/*
+ * Constants used in the (external) UTF-8 <--> (internal) Modified UTF-8
+ * conversion code.
+ */
+
+#define FROM_STANDARD_UTF8 0
+#define TO_STANDARD_UTF8 1
+
+/*
* Functions used only in this module.
*/
@@ -2160,7 +2168,7 @@ UtfIntToUtfExtProc(
* output buffer. */
{
return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
+ srcReadPtr, dstWrotePtr, dstCharsPtr, TO_STANDARD_UTF8);
}
/*
@@ -2209,7 +2217,7 @@ UtfExtToUtfIntProc(
* output buffer. */
{
return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
+ srcReadPtr, dstWrotePtr, dstCharsPtr, FROM_STANDARD_UTF8);
}
/*
@@ -2230,6 +2238,153 @@ UtfExtToUtfIntProc(
*-------------------------------------------------------------------------
*/
+static INLINE int
+IntToUtf(
+ unsigned ch, /* The character to be stored in the
+ * buffer. */
+ char *buf) /* Buffer in which the UTF-8 representation of
+ * the character is stored. Buffer must be
+ * large enough to hold the UTF-8 character
+ * (at most 6 bytes). */
+{
+ if ((ch > 0) && (ch < 0x80)) {
+ buf[0] = (char) ch;
+ return 1;
+ }
+ if (ch <= 0x7FF) {
+ buf[1] = (char) ((ch | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 6) | 0xC0);
+ return 2;
+ }
+ if (ch <= 0xFFFF) {
+ three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
+ }
+ if (ch <= 0x1FFFFF) {
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 18) | 0xF0);
+ return 4;
+ }
+ if (ch <= 0x3FFFFFF) {
+ buf[4] = (char) ((ch | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 24) | 0xF8);
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ buf[5] = (char) ((ch | 0x80) & 0xBF);
+ buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 30) | 0xFC);
+ return 6;
+ }
+
+ ch = 0xFFFD;
+ goto three;
+}
+
+static INLINE int
+UtfToInt(
+ const char *src, /* The UTF-8 string. */
+ unsigned *chPtr) /* Filled with the character represented by
+ * the front of the UTF-8 string. */
+{
+ register int byte;
+
+ /*
+ * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones.
+ */
+
+ byte = *((unsigned char *) src);
+ if (byte < 0xC0) {
+ /*
+ * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
+ * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+ * characters representing themselves.
+ */
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
+ } else if (byte < 0xE0) {
+ if ((src[1] & 0xC0) == 0x80) {
+ /*
+ * Two-byte-character lead-byte followed by a trail-byte.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+ return 2;
+ }
+
+ /*
+ * A two-byte-character lead-byte not followed by trail-byte
+ * represents itself.
+ */
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
+ } else if (byte < 0xF0) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
+ /*
+ * Three-byte-character lead byte followed by two trail bytes.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
+ | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+ return 3;
+ }
+
+ /*
+ * A three-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
+ } else {
+ int ch, total, trail;
+ static const unsigned char totalBytes[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
+ };
+
+ total = totalBytes[byte];
+ trail = total - 1;
+ if (trail > 0) {
+ ch = byte & (0x3F >> trail);
+ do {
+ src++;
+ if ((*src & 0xC0) != 0x80) {
+ *chPtr = byte;
+ return 1;
+ }
+ ch <<= 6;
+ ch |= (*src & 0x3F);
+ trail--;
+ } while (trail > 0);
+ *chPtr = ch;
+ return total;
+ } else {
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
+ }
+ }
+}
+
static int
UtfToUtfProc(
ClientData clientData, /* Not used. */
@@ -2256,7 +2411,7 @@ UtfToUtfProc(
int *dstCharsPtr, /* Filled with the number of characters that
* correspond to the bytes stored in the
* output buffer. */
- int pureNullMode) /* Convert embedded nulls from internal
+ int conversionMode) /* Convert embedded nulls from internal
* representation to real null-bytes or vice
* versa. */
{
@@ -2291,15 +2446,16 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+ if (UCHAR(*src) < 0x80 &&
+ !(UCHAR(*src) == 0 && conversionMode == FROM_STANDARD_UTF8)) {
/*
* Copy 7bit chatacters, but skip null-bytes when we are in input
* mode, so that they get converted to 0xc080.
*/
*dst++ = *src++;
- } else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 &&
- UCHAR(*(src+1)) == 0x80) {
+ } else if (conversionMode == TO_STANDARD_UTF8 && UCHAR(*src) == 0xc0
+ && UCHAR(*(src+1)) == 0x80) {
/*
* Convert 0xc080 to real nulls when we are in output mode.
*/
@@ -2310,15 +2466,51 @@ UtfToUtfProc(
/*
* Always check before using Tcl_UtfToUniChar. Not doing can so
* cause it run beyond the endof the buffer! If we happen such an
- * incomplete char its byts are made to represent themselves.
+ * incomplete char, its bytes are made to represent themselves.
*/
ch = (unsigned char) *src;
src += 1;
dst += Tcl_UniCharToUtf(ch, dst);
} else {
- src += Tcl_UtfToUniChar(src, &ch);
- dst += Tcl_UniCharToUtf(ch, dst);
+ /*
+ * This is where we ought to do surrogate pair handling, with the
+ * correct way of doing it depending on the conversionMode
+ * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE!
+ */
+
+ if (conversionMode == TO_STANDARD_UTF8) {
+ const char *origin = src;
+
+ src += Tcl_UtfToUniChar(src, &ch);
+ if (ch >= 0xD800 && ch < 0xDBFF) {
+ unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;
+
+ src += Tcl_UtfToUniChar(src, &ch);
+ if (ch >= 0xDC00 && ch < 0xDFFF) {
+ fullChar += 0x2400 + (unsigned) ch;
+ dst += IntToUtf(fullChar, dst);
+ continue;
+ } else {
+ src = origin + Tcl_UtfToUniChar(origin, &ch);
+ }
+ }
+ dst += Tcl_UniCharToUtf(ch, dst);
+ } else {
+ unsigned fullChar;
+
+ src += UtfToInt(src, &fullChar);
+ if (fullChar > 0xFFFF) {
+ fullChar -= 0x10000;
+ ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800);
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
+ dst += Tcl_UniCharToUtf(ch, dst);
+ } else {
+ ch = (Tcl_UniChar) fullChar;
+ dst += Tcl_UniCharToUtf(ch, dst);
+ }
+ }
}
}
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index f0d08e7..74a89c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -133,7 +133,7 @@ UtfCount(
#endif
return 3;
}
-
+
/*
*---------------------------------------------------------------------------
*
diff --git a/generic/tclZlib.c b/generic/tclZlib.c
index 544fb6e..7f18b9e 100644
--- a/generic/tclZlib.c
+++ b/generic/tclZlib.c
@@ -1613,8 +1613,7 @@ ZlibCmd(
if (objc < 2) {
Tcl_WrongNumArgs(interp, 1, objv, "command arg ?...?");
return TCL_ERROR;
- }
- if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0,
+ } else if (Tcl_GetIndexFromObj(interp, objv[1], commands, "command", 0,
&command) != TCL_OK) {
return TCL_ERROR;
}
@@ -1972,8 +1971,7 @@ ZlibCmd(
Tcl_SetErrorCode(interp, "TCL", "ZIP", "NOVAL", NULL);
return TCL_ERROR;
}
- if (Tcl_GetIntFromObj(interp, objv[i],
- (int *) &limit) != TCL_OK) {
+ if (Tcl_GetIntFromObj(interp, objv[i], (int *) &limit) != TCL_OK) {
Tcl_AddErrorInfo(interp, "\n (in -limit option)");
return TCL_ERROR;
}
@@ -1984,6 +1982,10 @@ ZlibCmd(
}
}
+ /*
+ * Actually do the push of the instance of the transform.
+ */
+
if (ZlibStackChannelTransform(interp, mode, format, level, chan,
headerObj) == NULL) {
return TCL_ERROR;
@@ -2045,16 +2047,22 @@ ZlibStreamCmd(
ao_buffer, ao_finalize, ao_flush, ao_fullflush
};
+ /*
+ * Basic syntax checks.
+ */
+
if (objc < 2) {
Tcl_WrongNumArgs(interp, 1, objv, "option data ?...?");
return TCL_ERROR;
- }
-
- if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0,
+ } else if (Tcl_GetIndexFromObj(interp, objv[1], cmds, "option", 0,
&command) != TCL_OK) {
return TCL_ERROR;
}
+ /*
+ * Execute a relevant subcommand.
+ */
+
switch ((enum zlibStreamCommands) command) {
case zs_add: /* $strm add ?$flushopt? $data */
for (i=2; i<objc-1; i++) {
@@ -2104,6 +2112,11 @@ ZlibStreamCmd(
NULL);
return TCL_ERROR;
}
+ if (buffersize < 1 || buffersize > 65536) {
+ Tcl_AppendResult(interp, "buffer size must be between "
+ "1 byte and 64 kibibytes", NULL);
+ return TCL_ERROR;
+ }
}
if (flush == -2) {
diff --git a/tests/encoding.test b/tests/encoding.test
index 0374e2d..9219a69 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -328,6 +328,15 @@ test encoding-15.3 {UtfToUtfProc null character input} {
list [string bytelength $x] [string bytelength $y] $z
} {1 2 c080}
+test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} {
+ set x \xF0\xA4\xAD\xA2; # U+024B62
+ set y [encoding convertfrom utf-8 $x]
+ set z [encoding convertto utf-8 $y]
+ list [string length $x] [string length $y] [string length $z] \
+ [format 0x%04x.0x%04x {*}[scan $y %c%c]] \
+ [format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]]
+} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2}
+
test encoding-16.1 {UnicodeToUtfProc} {
set val [encoding convertfrom unicode NN]
list $val [format %x [scan $val %c]]