summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog7
-rw-r--r--generic/tclEncoding.c136
-rw-r--r--tests/encoding.test17
3 files changed, 154 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 2578fd7..9e3cac6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2004-05-27 Reinhard Max <max@suse.de>
+
+ * generic/tclEncoding.c:
+ * tests/encoding.test: added support and tests for translating
+ embedded null characters between real nullbytes and the internal
+ representation on input/output (Bug #949905).
+
2004-05-26 Don Porter <dgp@users.sourceforge.net>
* library/tcltest/tcltest.tcl: Correction to debug prints and testing
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index c596911..d66ce37 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -8,7 +8,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.4 2004/05/06 01:04:53 davygrvy Exp $
+ * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.5 2004/05/27 14:33:18 rmax Exp $
*/
#include "tclInt.h"
@@ -226,6 +226,16 @@ static int UtfToUtfProc _ANSI_ARGS_((ClientData clientData,
CONST char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr, int pureNullMode));
+static int UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
+ CONST char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr));
+static int UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
+ CONST char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
int *dstCharsPtr));
static int TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
@@ -273,8 +283,8 @@ TclInitEncodingSubsystem()
systemEncoding = Tcl_GetEncoding(NULL, type.encodingName);
type.encodingName = "utf-8";
- type.toUtfProc = UtfToUtfProc;
- type.fromUtfProc = UtfToUtfProc;
+ type.toUtfProc = UtfExtToUtfIntProc;
+ type.fromUtfProc = UtfIntToUtfExtProc;
type.freeProc = NULL;
type.nullSize = 1;
type.clientData = NULL;
@@ -1776,6 +1786,105 @@ BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
return result;
}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfIntToUtfExtProc --
+ *
+ * Convert from UTF-8 to UTF-8 while converting null-bytes from
+ * Tcl's internal representation (0xc0, 0x80) to the official
+ * representation (0x00). See UtfToUtfProc for details.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr)
+ ClientData clientData; /* Not used. */
+ CONST char *src; /* Source string in UTF-8. */
+ int srcLen; /* Source string length in bytes. */
+ int flags; /* Conversion control flags. */
+ Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+ * state information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst; /* Output buffer in which converted string
+ * is stored. */
+ int dstLen; /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr; /* Filled with the number of bytes from the
+ * source string that were converted. This
+ * may be less than the original source length
+ * if there was a problem converting some
+ * source characters. */
+ int *dstWrotePtr; /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr; /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfExtToUtfIntProc --
+ *
+ * Convert from UTF-8 to UTF-8 while converting null-bytes from
+ * the official representation (0x00) to Tcl's internal
+ * representation (0xc0, 0x80). See UtfToUtfProc for details.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr)
+ ClientData clientData; /* Not used. */
+ CONST char *src; /* Source string in UTF-8. */
+ int srcLen; /* Source string length in bytes. */
+ int flags; /* Conversion control flags. */
+ Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+ * state information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst; /* Output buffer in which converted string
+ * is stored. */
+ int dstLen; /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr; /* Filled with the number of bytes from the
+ * source string that were converted. This
+ * may be less than the original source length
+ * if there was a problem converting some
+ * source characters. */
+ int *dstWrotePtr; /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr; /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
+}
+
/*
*-------------------------------------------------------------------------
*
@@ -1796,7 +1905,7 @@ BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
static int
UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr)
+ srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
ClientData clientData; /* Not used. */
CONST char *src; /* Source string in UTF-8. */
int srcLen; /* Source string length in bytes. */
@@ -1821,6 +1930,10 @@ UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
int *dstCharsPtr; /* Filled with the number of characters that
* correspond to the bytes stored in the
* output buffer. */
+ int pureNullMode; /* 1: convert internal nulls (0xc0, 0x80)
+ * to real null-bytes (output mode);
+ * 0: convert real null-bytes to 0xc080. */
+
{
CONST char *srcStart, *srcEnd, *srcClose;
char *dstStart, *dstEnd;
@@ -1853,8 +1966,21 @@ UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80) {
+ if (UCHAR(*src) < 0x80 &&
+ !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+ /*
+ * Copy 7-bit characters, but skip null-bytes when we are
+ * in input mode, so that they get converted to 0xc080.
+ */
*dst++ = *src++;
+ } else if (pureNullMode == 1 &&
+ UCHAR(*src) == 0xc0 &&
+ UCHAR(*(src+1)) == 0x80) {
+ /*
+ * Convert 0xc080 to real nulls when we are in output mode.
+ */
+ *dst++ = 0;
+ src += 2;
} else {
src += Tcl_UtfToUniChar(src, &ch);
dst += Tcl_UniCharToUtf(ch, dst);
diff --git a/tests/encoding.test b/tests/encoding.test
index 90ce6d4..83f5f69 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -8,7 +8,7 @@
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
-# RCS: @(#) $Id: encoding.test,v 1.16.2.1 2003/03/27 21:46:32 msofer Exp $
+# RCS: @(#) $Id: encoding.test,v 1.16.2.2 2004/05/27 14:33:20 rmax Exp $
package require tcltest 2
namespace import -force ::tcltest::*
@@ -291,6 +291,21 @@ test encoding-15.1 {UtfToUtfProc} {
encoding convertto utf-8 \xa3
} "\xc2\xa3"
+test encoding-15.2 {UtfToUtfProc null character output} {
+ set x \u0000
+ set y [encoding convertto utf-8 \u0000]
+ set y [encoding convertfrom identity $y]
+ binary scan $y H* z
+ list [string bytelength $x] [string bytelength $y] $z
+} {2 1 00}
+
+test encoding-15.3 {UtfToUtfProc null character input} {
+ set x [encoding convertfrom identity \x00]
+ set y [encoding convertfrom utf-8 $x]
+ binary scan [encoding convertto identity $y] H* z
+ list [string bytelength $x] [string bytelength $y] $z
+} {1 2 c080}
+
test encoding-16.1 {UnicodeToUtfProc} {
encoding convertfrom unicode NN
} "\u4e4e"