From 0dcff52f4d0961a06b24fcd48f63ce85cf71f169 Mon Sep 17 00:00:00 2001
From: andreas_kupries <akupries@shaw.ca>
Date: Sat, 19 May 2001 16:59:04 +0000
Subject: 	* Note that "tclbench" (see project "tcllib") was extended
 with 	  performance benchmarks for [fcopy] too.

	* doc/fcopy.n: Updated to reflect the extended behaviour of 'fcopy'.

	* tests/io.test: Added tests 'io-52.9', 'io-52.10' and 'io-52.11'
	  to test the handling of encodings by 'fcopy' / 'TclCopychannel'
	  [Bug #209210].

	* generic/tclIO.c: Split of both 'Tcl_ReadChars' and
	  'Tcl_WriteChars' into a public error checking and an internal
	  working part. The public functions now use the new internal
	  ones. The new functions are 'DoReadChars' and 'DoWriteChars'.
	  Extended 'CopyData' to use the new functions 'DoXChars' when
	  required by the encodings on the input and output channels
	  [Bug #209210].
---
 ChangeLog       |  23 ++++++-
 doc/fcopy.n     |  15 ++++-
 generic/tclIO.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 tests/io.test   |  80 ++++++++++++++++++++++-
 4 files changed, 286 insertions(+), 26 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e690d1b..6e395c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2001-05-19  Andreas Kupries <andreas_kupries@users.sourceforge.net>
+
+	* Note that "tclbench" (see project "tcllib") was extended with
+	  performance benchmarks for [fcopy] too.
+
+	* doc/fcopy.n: Updated to reflect the extended behaviour of 'fcopy'.
+
+	* tests/io.test: Added tests 'io-52.9', 'io-52.10' and 'io-52.11'
+	  to test the handling of encodings by 'fcopy' / 'TclCopychannel'
+	  [Bug #209210].
+
+	* generic/tclIO.c: Split of both 'Tcl_ReadChars' and
+	  'Tcl_WriteChars' into a public error checking and an internal
+	  working part. The public functions now use the new internal
+	  ones. The new functions are 'DoReadChars' and 'DoWriteChars'.
+	  Extended 'CopyData' to use the new functions 'DoXChars' when
+	  required by the encodings on the input and output channels
+	  [Bug #209210].
+
 2001-05-16  Jeff Hobbs  <jeffh@ActiveState.com>
 
 	* library/history.tcl (tcl::HistAdd): prevent empty calls from
@@ -210,7 +229,7 @@
 	Darley's changes to command tracing were added.  A const has been
 	added to the type signature of one of the parameters to
 	Tcl_CommandTraceProc.
-	
+
 2001-04-10  Kevin B. Kenny    <kennykb@acm.org>
 	* unix/tclUnixTime.c: Altered code to use memcpy instead of
 	structure assigments in an effort to achieve better K&R
@@ -220,7 +239,7 @@
 
 	* unix/tclUnixTime.c: Fixed silly typo in calls to 'gmtime' and
 	'localtime' that broke the Linux build.
-	
+
 2001-04-09  Kevin B. Kenny    <kennykb@acm.org>
 
 	* unix/tclLoadShl.c: Added DYNAMIC_PATH to the load flags so that
diff --git a/doc/fcopy.n b/doc/fcopy.n
index 5261f70..30b8dfe 100644
--- a/doc/fcopy.n
+++ b/doc/fcopy.n
@@ -5,7 +5,7 @@
 '\" See the file "license.terms" for information on usage and redistribution
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 '\" 
-'\" RCS: @(#) $Id: fcopy.n,v 1.2 1998/09/14 18:39:52 stanton Exp $
+'\" RCS: @(#) $Id: fcopy.n,v 1.3 2001/05/19 16:59:04 andreas_kupries Exp $
 '\" 
 .so man.macros
 .TH fcopy n 8.0 Tcl "Tcl Built-In Commands"
@@ -71,6 +71,19 @@ can be different than the number of bytes written to \fIoutchan\fR.
 Only the number of bytes written to \fIoutchan\fR is reported,
 either as the return value of a synchronous \fBfcopy\fP or
 as the argument to the callback for an asynchronous \fBfcopy\fP.
+.PP
+\fBFcopy\fR obeys the encodings configured for the channels. This
+means that the incoming characters are converted internally first
+UTF-8 and then into the encoding of the channel \fBfcopy\fR writes
+to. See the manual entry for \fBfconfigure\fR for details on the
+\fB\-encoding\fR option. No conversion is done if both channels are
+set to encoding "binary". If only the output channel is set to
+encoding "binary" the system will write the internal UTF-8
+representation of the incoming characters. If only the input channel
+is set to encoding "binary" the system will assume that the incoming
+bytes are valid UTF-8 characters and convert them according to the
+output encoding. The behaviour of the system for bytes which are not
+valid UTF-8 characters is undefined in this case.
 
 .SH EXAMPLE
 .PP
diff --git a/generic/tclIO.c b/generic/tclIO.c
index 2c1bc34..f66e96b 100644
--- a/generic/tclIO.c
+++ b/generic/tclIO.c
@@ -10,7 +10,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclIO.c,v 1.29 2001/03/30 23:06:39 andreas_kupries Exp $
+ * RCS: @(#) $Id: tclIO.c,v 1.30 2001/05/19 16:59:04 andreas_kupries Exp $
  */
 
 #include "tclInt.h"
@@ -112,6 +112,10 @@ static int		DoRead _ANSI_ARGS_((Channel *chanPtr, char *srcPtr,
 				int slen));
 static int		DoWrite _ANSI_ARGS_((Channel *chanPtr, char *src,
 				int srcLen));
+static int		DoReadChars _ANSI_ARGS_ ((Channel* chan,
+				Tcl_Obj* objPtr, int toRead, int appendFlag));
+static int		DoWriteChars _ANSI_ARGS_ ((Channel* chan,
+				CONST char* src, int len));
 static int		FilterInputBytes _ANSI_ARGS_((Channel *chanPtr,
 				GetsState *statePtr));
 static int		FlushChannel _ANSI_ARGS_((Tcl_Interp *interp,
@@ -2508,7 +2512,10 @@ Tcl_ClearChannelHandlers (channel)
  *	Puts a sequence of bytes into an output buffer, may queue the
  *	buffer for output if it gets full, and also remembers whether the
  *	current buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
+ *
+ *	No encoding conversions are applied to the bytes being read.
  *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
@@ -2555,7 +2562,10 @@ Tcl_Write(chan, src, srcLen)
  *	Puts a sequence of bytes into an output buffer, may queue the
  *	buffer for output if it gets full, and also remembers whether the
  *	current buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Writes directly to the driver of the channel,
+ *	does not compensate for stacking.
+ *
+ *	No encoding conversions are applied to the bytes being read.
  *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
@@ -2611,7 +2621,8 @@ Tcl_WriteRaw(chan, src, srcLen)
  *	using the channel's current encoding, may queue the buffer for
  *	output if it gets full, and also remembers whether the current
  *	buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
  *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
@@ -2631,18 +2642,55 @@ Tcl_WriteChars(chan, src, len)
     int len;			/* Length of string in bytes, or < 0 for 
 				 * strlen(). */
 {
-    /*
-     * Always use the topmost channel of the stack
-     */
-    Channel *chanPtr;
     ChannelState *statePtr;	/* state info for channel */
 
     statePtr = ((Channel *) chan)->state;
-    chanPtr  = statePtr->topChanPtr;
 
     if (CheckChannelErrors(statePtr, TCL_WRITABLE) != 0) {
 	return -1;
     }
+
+    return DoWriteChars ((Channel*) chan, src, len);
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoWriteChars --
+ *
+ *	Takes a sequence of UTF-8 characters and converts them for output
+ *	using the channel's current encoding, may queue the buffer for
+ *	output if it gets full, and also remembers whether the current
+ *	buffer is ready e.g. if it contains a newline and we are in
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
+ *
+ * Results:
+ *	The number of bytes written or -1 in case of error. If -1,
+ *	Tcl_GetErrno will return the error code.
+ *
+ * Side effects:
+ *	May buffer up output and may cause output to be produced on the
+ *	channel.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+DoWriteChars(chanPtr, src, len)
+    Channel* chanPtr;		/* The channel to buffer output for. */
+    CONST char *src;		/* UTF-8 characters to queue in output buffer. */
+    int len;			/* Length of string in bytes, or < 0 for 
+				 * strlen(). */
+{
+    /*
+     * Always use the topmost channel of the stack
+     */
+    ChannelState *statePtr;	/* state info for channel */
+
+    statePtr = chanPtr->state;
+    chanPtr  = statePtr->topChanPtr;
+
     if (len < 0) {
         len = strlen(src);
     }
@@ -4037,12 +4085,8 @@ Tcl_ReadChars(chan, objPtr, toRead, appendFlag)
 				 * of the object. */
 
 {
-    Channel *chanPtr = (Channel *) chan;
-    ChannelState *statePtr = chanPtr->state;	/* state info for channel */
-    ChannelBuffer *bufPtr;
-    int offset, factor, copied, copiedNow, result;
-    Tcl_Encoding encoding;
-#define UTF_EXPANSION_FACTOR	1024
+    Channel*      chanPtr  = (Channel *) chan;
+    ChannelState* statePtr = chanPtr->state;	/* state info for channel */
     
     /*
      * This operation should occur at the top of a channel stack.
@@ -4051,12 +4095,64 @@ Tcl_ReadChars(chan, objPtr, toRead, appendFlag)
     chanPtr = statePtr->topChanPtr;
 
     if (CheckChannelErrors(statePtr, TCL_READABLE) != 0) {
-	copied = -1;
-	goto done;
+        /*
+	 * Update the notifier state so we don't block while there is still
+	 * data in the buffers.
+	 */
+        UpdateInterest(chanPtr);
+	return -1;
     }
 
+    return DoReadChars (chanPtr, objPtr, toRead, appendFlag);
+}
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoReadChars --
+ *
+ *	Reads from the channel until the requested number of characters
+ *	have been seen, EOF is seen, or the channel would block.  EOL
+ *	and EOF translation is done.  If reading binary data, the raw
+ *	bytes are wrapped in a Tcl byte array object.  Otherwise, the raw
+ *	bytes are converted to UTF-8 using the channel's current encoding
+ *	and stored in a Tcl string object.
+ *
+ * Results:
+ *	The number of characters read, or -1 on error. Use Tcl_GetErrno()
+ *	to retrieve the error code for the error that occurred.
+ *
+ * Side effects:
+ *	May cause input to be buffered.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+int
+DoReadChars(chanPtr, objPtr, toRead, appendFlag)
+    Channel* chanPtr;		/* The channel to read. */
+    Tcl_Obj *objPtr;		/* Input data is stored in this object. */
+    int toRead;			/* Maximum number of characters to store,
+				 * or -1 to read all available data (up to EOF
+				 * or when channel blocks). */
+    int appendFlag;		/* If non-zero, data read from the channel
+				 * will be appended to the object.  Otherwise,
+				 * the data will replace the existing contents
+				 * of the object. */
+
+{
+    ChannelState *statePtr = chanPtr->state;	/* state info for channel */
+    ChannelBuffer *bufPtr;
+    int offset, factor, copied, copiedNow, result;
+    Tcl_Encoding encoding;
+#define UTF_EXPANSION_FACTOR	1024
+    
+    /*
+     * This operation should occur at the top of a channel stack.
+     */
+
+    chanPtr  = statePtr->topChanPtr;
     encoding = statePtr->encoding;
-    factor = UTF_EXPANSION_FACTOR;
+    factor   = UTF_EXPANSION_FACTOR;
 
     if (appendFlag == 0) {
 	if (encoding == NULL) {
@@ -7037,6 +7133,11 @@ CopyData(csPtr, mask)
     int result = TCL_OK;
     int size;
     int total;
+    int sizeb;
+    Tcl_Obj* bufObj = NULL;
+    char* buffer;
+
+    int inBinary, outBinary, sameEncoding; /* Encoding control */
 
     inChan	= (Tcl_Channel) csPtr->readPtr;
     outChan	= (Tcl_Channel) csPtr->writePtr;
@@ -7053,8 +7154,16 @@ CopyData(csPtr, mask)
      * thus gets the bottom of the stack.
      */
 
-    while (csPtr->toRead != 0) {
+    inBinary     = (inStatePtr->encoding  == NULL);
+    outBinary    = (outStatePtr->encoding == NULL);
+    sameEncoding = (inStatePtr->encoding  == outStatePtr->encoding);
+
+    if (!(inBinary || sameEncoding)) {
+        bufObj = Tcl_NewObj ();
+	Tcl_IncrRefCount (bufObj);
+    }
 
+    while (csPtr->toRead != 0) {
 	/*
 	 * Check for unreported background errors.
 	 */
@@ -7079,7 +7188,12 @@ CopyData(csPtr, mask)
 	} else {
 	    size = csPtr->toRead;
 	}
-	size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
+
+	if (inBinary || sameEncoding) {
+	    size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
+	} else {
+	    size = DoReadChars(inStatePtr->topChanPtr, bufObj, size, 0 /* No append */);
+	}
 
 	if (size < 0) {
 	    readError:
@@ -7105,6 +7219,10 @@ CopyData(csPtr, mask)
 		Tcl_CreateChannelHandler(inChan, TCL_READABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
 
@@ -7112,8 +7230,25 @@ CopyData(csPtr, mask)
 	 * Now write the buffer out.
 	 */
 
-	size = DoWrite(outStatePtr->topChanPtr, csPtr->buffer, size);
-	if (size < 0) {
+	if (inBinary || sameEncoding) {
+	    buffer = csPtr->buffer;
+	    sizeb = size;
+	} else {
+	    buffer = Tcl_GetStringFromObj (bufObj, &sizeb);
+	}
+
+	if (outBinary || sameEncoding) {
+	    sizeb = DoWrite(outStatePtr->topChanPtr, buffer, sizeb);
+	} else {
+	    sizeb = DoWriteChars(outStatePtr->topChanPtr, buffer, sizeb);
+	}
+
+	if (inBinary || sameEncoding) {
+	    /* Both read and write counted bytes */
+	    size = sizeb;
+	} /* else : Read counted characters, write counted bytes, i.e. size != sizeb */
+
+	if (sizeb < 0) {
 	    writeError:
 	    errObj = Tcl_NewObj();
 	    Tcl_AppendStringsToObj(errObj, "error writing \"",
@@ -7148,6 +7283,10 @@ CopyData(csPtr, mask)
 		Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
 
@@ -7166,8 +7305,17 @@ CopyData(csPtr, mask)
 		Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
+    } /* while */
+
+    if (bufObj != (Tcl_Obj*) NULL) {
+        Tcl_DecrRefCount (bufObj);
+	bufObj = (Tcl_Obj*) NULL;
     }
 
     /*
@@ -7218,6 +7366,8 @@ CopyData(csPtr, mask)
  *
  *	Reads a given number of bytes from a channel.
  *
+ *	No encoding conversions are applied to the bytes being read.
+ *
  * Results:
  *	The number of characters read, or -1 on error. Use Tcl_GetErrno()
  *	to retrieve the error code for the error that occurred.
diff --git a/tests/io.test b/tests/io.test
index e3ac4a1..c193793 100644
--- a/tests/io.test
+++ b/tests/io.test
@@ -12,7 +12,7 @@
 # See the file "license.terms" for information on usage and redistribution
 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 #
-# RCS: @(#) $Id: io.test,v 1.16 2001/04/04 17:35:25 andreas_kupries Exp $
+# RCS: @(#) $Id: io.test,v 1.17 2001/05/19 16:59:04 andreas_kupries Exp $
 
 if {[lsearch [namespace children] ::tcltest] == -1} {
     package require tcltest
@@ -6369,6 +6369,84 @@ test io-52.8 {TclCopyChannel} {stdio} {
     list $s0 [file size test1]
 } {40 40}
 
+# Empty files, to register them with the test facility
+makeFile {} kyrillic.txt
+makeFile {} utf8-fcopy.txt
+makeFile {} utf8-rp.txt
+
+# Create kyrillic file
+set out [open kyrillic.txt w]
+fconfigure $out -encoding koi8-r
+puts       $out "\u0410\u0410"
+close      $out
+
+test io-52.9 {TclCopyChannel & encodings} {
+    # Copy kyrillic to UTF-8, using fcopy.
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-fcopy.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding utf-8
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    # Do the same again, but differently (read/puts).
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-rp.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding utf-8
+
+    puts -nonewline $out [read $in]
+
+    close $in
+    close $out
+
+    list \
+	    [file size kyrillic.txt]   \
+	    [file size utf8-fcopy.txt] \
+	    [file size utf8-rp.txt]
+} {3 5 5}
+
+test io-52.10 {TclCopyChannel & encodings} {
+    # encoding to binary (=> implies that the
+    # internal utf-8 is written)
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-fcopy.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding binary
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    file size utf8-fcopy.txt
+} 5
+
+test io-52.11 {TclCopyChannel & encodings} {
+    # binary to encoding => the input has to be
+    # in utf-8 to make sense to the encoder
+
+    set in  [open utf8-fcopy.txt r]
+    set out [open kyrillic.txt w]
+
+    fconfigure $in  -encoding binary
+    fconfigure $out -encoding koi8-r
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    file size kyrillic.txt
+} 3
+
+
 test io-53.1 {CopyData} {
     removeFile test1
     set f1 [open $thisScript]
-- 
cgit v0.12