From 745b7fd7ab8c61fc34586c00f4cb09945292c3ea Mon Sep 17 00:00:00 2001
From: rmax <rmax>
Date: Thu, 27 May 2004 14:33:17 +0000
Subject: * generic/tclEncoding.c: * tests/encoding.test: added support and
 tests for translating embedded null characters between real nullbytes and the
 internal representation on input/output (Bug #949905).

---
 ChangeLog             |   7 +++
 generic/tclEncoding.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/encoding.test   |  17 ++++++-
 3 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 2578fd7..9e3cac6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2004-05-27  Reinhard Max  <max@suse.de>
+
+        * generic/tclEncoding.c: 
+        * tests/encoding.test: added support and tests for translating
+	embedded null characters between real nullbytes and the internal
+	representation on input/output (Bug #949905).
+
 2004-05-26  Don Porter  <dgp@users.sourceforge.net>
 
 	* library/tcltest/tcltest.tcl:	Correction to debug prints and testing
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index c596911..d66ce37 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.4 2004/05/06 01:04:53 davygrvy Exp $
+ * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.5 2004/05/27 14:33:18 rmax Exp $
  */
 
 #include "tclInt.h"
@@ -226,6 +226,16 @@ static int		UtfToUtfProc _ANSI_ARGS_((ClientData clientData,
 			    CONST char *src, int srcLen, int flags,
 			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
 			    int *srcReadPtr, int *dstWrotePtr,
+			    int *dstCharsPtr, int pureNullMode));
+static int		UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
+			    CONST char *src, int srcLen, int flags,
+			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
+			    int *srcReadPtr, int *dstWrotePtr,
+			    int *dstCharsPtr));
+static int		UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
+			    CONST char *src, int srcLen, int flags,
+			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
+			    int *srcReadPtr, int *dstWrotePtr,
 			    int *dstCharsPtr));
 static int		TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
 
@@ -273,8 +283,8 @@ TclInitEncodingSubsystem()
     systemEncoding	= Tcl_GetEncoding(NULL, type.encodingName);
 
     type.encodingName	= "utf-8";
-    type.toUtfProc	= UtfToUtfProc;
-    type.fromUtfProc    = UtfToUtfProc;
+    type.toUtfProc	= UtfExtToUtfIntProc;
+    type.fromUtfProc	= UtfIntToUtfExtProc;
     type.freeProc	= NULL;
     type.nullSize	= 1;
     type.clientData	= NULL;
@@ -1776,6 +1786,105 @@ BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
     return result;
 }
 
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfIntToUtfExtProc --
+ *
+ *	Convert from UTF-8 to UTF-8 while converting null-bytes from
+ *	Tcl's internal representation (0xc0, 0x80) to the official
+ *	representation (0x00). See UtfToUtfProc for details.
+ *
+ * Results:
+ *	Returns UtfToUtfProc's result (pureNullMode 1); TCL_OK on success.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int 
+UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+	     srcReadPtr, dstWrotePtr, dstCharsPtr)
+    ClientData clientData;	/* Not used. */
+    CONST char *src;		/* Source string in UTF-8. */
+    int srcLen;			/* Source string length in bytes. */
+    int flags;			/* Conversion control flags. */
+    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+				 * state information used during a piecewise
+				 * conversion.  Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
+    char *dst;			/* Output buffer in which converted string
+				 * is stored. */
+    int dstLen;			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr;		/* Filled with the number of bytes from the
+				 * source string that were converted.  This
+				 * may be less than the original source length
+				 * if there was a problem converting some
+				 * source characters. */
+    int *dstWrotePtr;		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr;		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+			srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfExtToUtfIntProc --
+ *
+ *	Convert from UTF-8 to UTF-8 while converting null-bytes from
+ *	the official representation (0x00) to Tcl's internal
+ *	representation (0xc0, 0x80). See UtfToUtfProc for details.
+ *
+ * Results:
+ *	Returns UtfToUtfProc's result (pureNullMode 0); TCL_OK on success.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int 
+UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+	     srcReadPtr, dstWrotePtr, dstCharsPtr)
+    ClientData clientData;	/* Not used. */
+    CONST char *src;		/* Source string in UTF-8. */
+    int srcLen;			/* Source string length in bytes. */
+    int flags;			/* Conversion control flags. */
+    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+				 * state information used during a piecewise
+				 * conversion.  Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
+    char *dst;			/* Output buffer in which converted string
+				 * is stored. */
+    int dstLen;			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr;		/* Filled with the number of bytes from the
+				 * source string that were converted.  This
+				 * may be less than the original source length
+				 * if there was a problem converting some
+				 * source characters. */
+    int *dstWrotePtr;		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr;		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+			srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
+}
+
 /*
  *-------------------------------------------------------------------------
  *
@@ -1796,7 +1905,7 @@ BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 
 static int 
 UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	srcReadPtr, dstWrotePtr, dstCharsPtr)
+	     srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
     ClientData clientData;	/* Not used. */
     CONST char *src;		/* Source string in UTF-8. */
     int srcLen;			/* Source string length in bytes. */
@@ -1821,6 +1930,10 @@ UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
     int *dstCharsPtr;		/* Filled with the number of characters that
 				 * correspond to the bytes stored in the
 				 * output buffer. */
+    int pureNullMode;		/* Convert embedded nulls from
+				 * internal representation to real
+				 * null-bytes or vice versa */
+
 {
     CONST char *srcStart, *srcEnd, *srcClose;
     char *dstStart, *dstEnd;
@@ -1853,8 +1966,21 @@ UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	if (UCHAR(*src) < 0x80) {
+	if (UCHAR(*src) < 0x80 &&
+	    !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+	    /*
+	     * Copy 7bit characters, but skip null-bytes when we are
+	     * in input mode, so that they get converted to 0xc080.
+	     */
 	    *dst++ = *src++;
+	} else if (pureNullMode == 1 &&
+		   UCHAR(*src) == 0xc0 &&
+		   UCHAR(*(src+1)) == 0x80) {
+	    /* 
+	     * Convert 0xc080 to real nulls when we are in output mode.
+	     */
+	    *dst++ = 0;
+	    src += 2;
 	} else {
 	    src += Tcl_UtfToUniChar(src, &ch);
 	    dst += Tcl_UniCharToUtf(ch, dst);
diff --git a/tests/encoding.test b/tests/encoding.test
index 90ce6d4..83f5f69 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -8,7 +8,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 #
-# RCS: @(#) $Id: encoding.test,v 1.16.2.1 2003/03/27 21:46:32 msofer Exp $
+# RCS: @(#) $Id: encoding.test,v 1.16.2.2 2004/05/27 14:33:20 rmax Exp $
 
 package require tcltest 2
 namespace import -force ::tcltest::*
@@ -291,6 +291,21 @@ test encoding-15.1 {UtfToUtfProc} {
     encoding convertto utf-8 \xa3
 } "\xc2\xa3"
 
+test encoding-15.2 {UtfToUtfProc null character output} {
+    set x \u0000
+    set y [encoding convertto utf-8 \u0000]
+    set y [encoding convertfrom identity $y]
+    binary scan $y H* z
+    list [string bytelength $x] [string bytelength $y] $z
+} {2 1 00}
+
+test encoding-15.3 {UtfToUtfProc null character input} {
+    set x [encoding convertfrom identity \x00]
+    set y [encoding convertfrom utf-8 $x]
+    binary scan [encoding convertto identity $y] H* z
+    list [string bytelength $x] [string bytelength $y] $z
+} {1 2 c080}
+
 test encoding-16.1 {UnicodeToUtfProc} {
     encoding convertfrom unicode NN
 } "\u4e4e"
-- 
cgit v0.12