From 5d3ba90d87622e64517625053b18380f5dd96908 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Mon, 3 Apr 2023 20:58:46 +0000 Subject: Fix for [fa3d9fd818fa0072], [fcopy $chan1 $chan2 -size $size] is not [puts -nonewline $chan2 [read $chan1 -size $size]. --- doc/chan.n | 153 ++++++++++++++++++++++++++++++++++++-------------- doc/fcopy.n | 114 +++++++++++-------------------------- generic/tclEncoding.c | 3 +- generic/tclIO.c | 98 ++++++++++++++++++++------------ tests/chanio.test | 21 ++++--- tests/io.test | 52 ++++++++++++----- 6 files changed, 260 insertions(+), 181 deletions(-) diff --git a/doc/chan.n b/doc/chan.n index e8601f6..14fa941 100644 --- a/doc/chan.n +++ b/doc/chan.n @@ -226,50 +226,119 @@ typically used on UNIX platforms, .TP \fBchan copy \fIinputChan outputChan\fR ?\fB\-size \fIsize\fR? ?\fB\-command \fIcallback\fR? . -Copies data from \fIinputChan\fR to \fIoutputChan\fR, leveraging internal -buffers to avoid extra copies and to avoid buffering too much data in main -memory when copying large files to slow destinations like network sockets. +Reads characters from \fIinputChan\fR and writes them to \fIoutputChan\fR until +all characters are copied, blocking until the copy is complete and returning +the number of characters copied. Leverages internal buffers to avoid extra +copies and to avoid buffering too much data in main memory when copying large +files to slow destinations like network sockets. .RS .PP -If \fB\-size\fR is given, the size is in bytes if the two channels have the -same encoding and in characters otherwise, and only that amount is copied. -Otherwise, all data until the end of the file is copied. - -\fBchan copy\fR blocks until the copy is complete and returns the number of -bytes or characters written to \fIoutputChan\fR. -.PP -If \fB\-command\fR is given, \fBchan copy\fR returns immediately, the copy is -carried out in the background, and then \fIcallback\fR is called with the -number of bytes written to \fIoutputChan\fR as its first argument, and the -error message for any error that occurred as its second argument. -\fIinputChan\fR and \fIoutputChan\fR are automatically configured for -non-blocking mode if needed. Background copying only works correctly if the -event loop is active, e.g. via \fBvwait\fR or Tk. -.PP -During a background copy no other read or write operation may be performed on -\fIinputChan\fR or \fIoutputChan\fR. If either \fIinputChan\fR or -\fIoutputChan\fR is closed while the copy is in progress copying ceases and -\fBno\fR callback is made. If \fIinputChan\fR is closed all data already queued -is written to \fIoutputChan\fR. -.PP -The should be no event handler established for \fIinputChan\fR because it may -become readable during a background copy. An attempt to read or write -from within an event handler results result in the error, "channel busy". -.PP -Due to end-of-line translation the number of bytes read from \fIinputChan\fR -may be different than the number of bytes written to \fIoutputChan\fR. Only -the number of bytes written to \fIoutputChan\fR is reported. -.PP -\fBChan copy\fR reads the data according to the \fB\-encoding\fR, -\fB\-translation\fR, and \fB\-eofchar\fR of the source and writes to the -destination according to the configuration for that channel. If the encoding -and translation of both channels is \fBbinary\fR and the \fB\-eofchar\fR of -both channels is the empty string, an identical copy is made. If only the -encoding of the destination is \fBbinary\fR, Tcl's internal modified UTF-8 -representation of the characters read from the source is written to the -destination. If only the encoding of the source is \fBbinary\fR, each byte read -becomes one Unicode character in the range of 0 to 255, and that character is -subject to the encoding and translation of the destination as it is written. +\fB\-size\fR limits the number of characters copied. +.PP +If \fB\-command\fR is gviven, \fBchan copy\fR returns immediately, works in the +background, and calls \fIcallback\fR when the copy completes, providing as an +additional argument the number of characters written to \fIoutputChan\fR. If +an error occurres during the background copy, another argument provides message +for the error. \fIinputChan\fR and \fIoutputChan\fR are automatically +configured for non-blocking mode if needed. Background copying only works +correctly if events are being processed, e.g. via \fBvwait\fR or Tk. +.PP +During a background copy no other read operation may be performed on +\fIinputChan\fR, and no write operation may be performed on +\fIoutputChan\fR. However, write operations may by performed on +\fIinputChan\fR and read operations may be performed on \fIoutputChan\fR, as +exhibited by the bidirectional copy example below. +.PP +If either \fIinputChan\fR or \fIoutputChan\fR is closed while the copy is in +progress, copying ceases and \fBno\fR callback is made. If \fIinputChan\fR is +closed all data already queued is written to \fIoutputChan\fR. +.PP +There should be no event handler established for \fIinputChan\fR because it +may become readable during a background copy. An attempt to read or write from +within an event handler results result in the error, "channel busy". Any +wrong-sided I/O attempted (by a \fBfileevent\fR handler or otherwise) results +in a +.QW "channel busy" +error. +.PP +.PP +.IP \fBEXAMPLES\fR +.PP +The first example transfers the contents of one channel exactly to +another. Note that when copying one file to another, it is better to +use \fBfile copy\fR which also copies file metadata (e.g. the file +access permissions) where possible. +.PP +.CS +fconfigure $in -translation binary +fconfigure $out -translation binary +\fBfcopy\fR $in $out +.CE +.PP +This second example shows how the callback gets +passed the number of bytes transferred. +It also uses vwait to put the application into the event loop. +Of course, this simplified example could be done without the command +callback. +.PP +.CS +proc Cleanup {in out bytes {error {}}} { + global total + set total $bytes + close $in + close $out + if {[string length $error] != 0} { + # error occurred during the copy + } +} +set in [open $file1] +set out [socket $server $port] +\fBfcopy\fR $in $out -command [list Cleanup $in $out] +vwait total +.CE +.PP +The third example copies in chunks and tests for end of file +in the command callback. +.PP +.CS +proc CopyMore {in out chunk bytes {error {}}} { + global total done + incr total $bytes + if {([string length $error] != 0) || [eof $in]} { + set done $total + close $in + close $out + } else { + \fBfcopy\fR $in $out -size $chunk \e + -command [list CopyMore $in $out $chunk] + } +} +set in [open $file1] +set out [socket $server $port] +set chunk 1024 +set total 0 +\fBfcopy\fR $in $out -size $chunk \e + -command [list CopyMore $in $out $chunk] +vwait done +.CE +.PP +The fourth example starts an asynchronous, bidirectional fcopy between +two sockets. Those could also be pipes from two [open "|hal 9000" r+] +(though their conversation would remain secret to the script, since +all four fileevent slots are busy). +.PP +.CS +set flows 2 +proc Done {dir args} { + global flows done + puts "$dir is over." + incr flows -1 + if {$flows<=0} {set done 1} +} +\fBfcopy\fR $sok1 $sok2 -command [list Done UP] +\fBfcopy\fR $sok2 $sok1 -command [list Done DOWN] +vwait done +.CE .RE .TP \fBchan create \fImode cmdPrefix\fR diff --git a/doc/fcopy.n b/doc/fcopy.n index 477f242..b043898 100644 --- a/doc/fcopy.n +++ b/doc/fcopy.n @@ -12,90 +12,44 @@ .SH NAME fcopy \- Copy data from one channel to another .SH SYNOPSIS -\fBfcopy \fIinchan\fR \fIoutchan\fR ?\fB\-size \fIsize\fR? ?\fB\-command \fIcallback\fR? +\fBfcopy \fIinputChan\fR \fIoutputChan\fR ?\fB\-size \fIsize\fR? ?\fB\-command \fIcallback\fR? .BE .SH DESCRIPTION .PP -The \fBfcopy\fR command copies data from one I/O channel, \fIinchan\fR to another I/O channel, \fIoutchan\fR. -The \fBfcopy\fR command leverages the buffering in the Tcl I/O system to -avoid extra copies and to avoid buffering too much data in -main memory when copying large files to slow destinations like -network sockets. -.PP -The \fBfcopy\fR -command transfers data from \fIinchan\fR until end of file -or \fIsize\fR bytes or characters have been -transferred; \fIsize\fR is in bytes if the input channel is in binary mode, -and is in characters otherwise. -If no \fB\-size\fR argument is given, -then the copy goes until end of file. -All the data read from \fIinchan\fR is copied to \fIoutchan\fR. -Without the \fB\-command\fR option, \fBfcopy\fR blocks until the copy is complete -and returns the number of bytes or characters (using the same rules as -for the \fB\-size\fR option) written to \fIoutchan\fR. -.PP -The \fB\-command\fR argument makes \fBfcopy\fR work in the background. -In this case it returns immediately and the \fIcallback\fR is invoked -later when the copy completes. -The \fIcallback\fR is called with -one or two additional -arguments that indicates how many bytes were written to \fIoutchan\fR. -If an error occurred during the background copy, the second argument is the -error string associated with the error. -With a background copy, -it is not necessary to put \fIinchan\fR or \fIoutchan\fR into -non-blocking mode; the \fBfcopy\fR command takes care of that automatically. -However, it is necessary to enter the event loop by using -the \fBvwait\fR command or by using Tk. -.PP -You are not allowed to do other input operations with \fIinchan\fR, or -output operations with \fIoutchan\fR, during a background -\fBfcopy\fR. The converse is entirely legitimate, as exhibited by the -bidirectional fcopy example below. -.PP -If either \fIinchan\fR or \fIoutchan\fR get closed -while the copy is in progress, the current copy is stopped -and the command callback is \fInot\fR made. -If \fIinchan\fR is closed, -then all data already queued for \fIoutchan\fR is written out. -.PP -Note that \fIinchan\fR can become readable during a background copy. -You should turn off any \fBfileevent\fR handlers during a background -copy so those handlers do not interfere with the copy. -Any wrong-sided I/O attempted (by a \fBfileevent\fR handler or otherwise) will get a +Reads characters from \fIinputChan\fR and writes them to \fIoutputChan\fR until +all characters are copied, blocking until the copy is complete and returning +the number of characters copied. Leverages internal buffers to avoid extra +copies and to avoid buffering too much data in main memory when copying large +files to slow destinations like network sockets. +.PP +\fB\-size\fR limits the number of characters copied. +.PP +\fB\-command\fR makes \fBfcopy\fR return immediately, work in the background, +and call \fIcallback\fR when the copy completes, providing as an additional +argument the number of characters written to \fIoutputChan\fR. If an error +occurres during the background copy, another argument provides the message for +the error. \fIinputChan\fR and \fIoutputChan\fR are automatically configured +for non-blocking mode if needed. Background copying only works correctly if +events are being processed e.g. via \fBvwait\fR or Tk. +.PP +During a background copy no other read operation may be performed on +\fIinputChan\fR, and no other write operation may be performed on +\fIoutputChan\fR. However, write operations may by performed on +\fIinputChan\fR and read operations may be performed on \fIoutputChan\fR, as +exhibited by the bidirectional copy example below. +.PP +If either \fIinputChan\fR or \fIoutputChan\fR is closed while the copy is in +progress, copying ceases and \fBno\fR callback is made. If \fIinputChan\fR is +closed all data already queued is written to \fIoutputChan\fR. +.PP +There should be no event handler established for \fIinputChan\fR because it +may become readable during a background copy. An attempt to read or write from +within an event handler results result in the error, "channel busy". Any +wrong-sided I/O attempted (by a \fBfileevent\fR handler or otherwise) results +in a .QW "channel busy" error. -.PP -\fBFcopy\fR translates end-of-line sequences in \fIinchan\fR and \fIoutchan\fR -according to the \fB\-translation\fR option -for these channels. -See the manual entry for \fBfconfigure\fR for details on the -\fB\-translation\fR option. -The translations mean that the number of bytes read from \fIinchan\fR -can be different than the number of bytes written to \fIoutchan\fR. -Only the number of bytes written to \fIoutchan\fR is reported, -either as the return value of a synchronous \fBfcopy\fR or -as the argument to the callback for an asynchronous \fBfcopy\fR. -.PP -\fBFcopy\fR obeys the encodings and character translations configured -for the channels. This -means that the incoming characters are converted internally first -UTF-8 and then into the encoding of the channel \fBfcopy\fR writes -to. See the manual entry for \fBfconfigure\fR for details on the -\fB\-encoding\fR and \fB\-translation\fR options. No conversion is -done if both channels are -set to encoding -.QW binary -and have matching translations. If only the output channel is set to encoding -.QW binary -the system will write the internal UTF-8 representation of the incoming -characters. If only the input channel is set to encoding -.QW binary -the system will assume that the incoming -bytes are valid UTF-8 characters and convert them according to the -output encoding. The behaviour of the system for bytes which are not -valid UTF-8 characters is undefined in this case. .SH EXAMPLES .PP The first example transfers the contents of one channel exactly to @@ -144,7 +98,7 @@ proc CopyMore {in out chunk bytes {error {}}} { close $out } else { \fBfcopy\fR $in $out -size $chunk \e - -command [list CopyMore $in $out $chunk] + -command [list CopyMore $in $out $chunk] } } set in [open $file1] @@ -152,7 +106,7 @@ set out [socket $server $port] set chunk 1024 set total 0 \fBfcopy\fR $in $out -size $chunk \e - -command [list CopyMore $in $out $chunk] + -command [list CopyMore $in $out $chunk] vwait done .CE .PP diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2a96383..689fa50 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2570,8 +2570,7 @@ UtfToUtfProc( } } else { /* - * Convert 0xC080 to real nulls when we are in output mode, - * irrespective of the profile. + * For output convert 0xC080 to a real null. */ *dst++ = 0; src += 2; diff --git a/generic/tclIO.c b/generic/tclIO.c index 73d787a..a45f39a 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -174,6 +174,8 @@ static int CloseWrite(Tcl_Interp *interp, Channel *chanPtr); static void CommonGetsCleanup(Channel *chanPtr); static int CopyData(CopyState *csPtr, int mask); static void DeleteTimerHandler(ChannelState *statePtr); +int Lossless(ChannelState *inStatePtr, + ChannelState *outStatePtr, long long toRead); static int MoveBytes(CopyState *csPtr); static void MBCallback(CopyState *csPtr, Tcl_Obj *errObj); @@ -338,6 +340,9 @@ static const Tcl_ObjType chanObjType = { TCL_OBJTYPE_V0 }; +#define GetIso88591() \ + (binaryEncoding ? Tcl_GetEncoding(NULL, "iso8859-1") : binaryEncoding) + #define ChanSetInternalRep(objPtr, resPtr) \ do { \ Tcl_ObjInternalRep ir; \ @@ -9364,18 +9369,7 @@ TclCopyChannel( ResetFlag(outStatePtr, CHANNEL_LINEBUFFERED); SetFlag(outStatePtr, CHANNEL_UNBUFFERED); - /* - * Test for conditions where we know we can just move bytes from input - * channel to output channel with no transformation or even examination - * of the bytes themselves. - */ - - moveBytes = inStatePtr->inEofChar == '\0' /* No eofChar to stop input */ - && inStatePtr->inputTranslation == TCL_TRANSLATE_LF - && outStatePtr->outputTranslation == TCL_TRANSLATE_LF - && inStatePtr->encoding == outStatePtr->encoding - && CHANNEL_PROFILE_GET(inStatePtr->inputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8 - && CHANNEL_PROFILE_GET(outStatePtr->outputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8; + moveBytes = Lossless(inStatePtr, outStatePtr, toRead); /* * Allocate a new CopyState to maintain info about the current copy in @@ -9682,8 +9676,7 @@ CopyData( Tcl_WideInt total; Tcl_WideInt size; /* TODO - be careful if total and size are made unsigned */ const char *buffer; - int inBinary, outBinary, sameEncoding; - /* Encoding control */ + int moveBytes; int underflow; /* Input underflow */ inChan = (Tcl_Channel) csPtr->readPtr; @@ -9701,13 +9694,9 @@ CopyData( * the bottom of the stack. */ - inBinary = (inStatePtr->encoding == NULL); - outBinary = (outStatePtr->encoding == NULL); - sameEncoding = inStatePtr->encoding == outStatePtr->encoding - && CHANNEL_PROFILE_GET(inStatePtr->inputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8 - && CHANNEL_PROFILE_GET(outStatePtr->outputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8; + moveBytes = Lossless(inStatePtr, outStatePtr, csPtr->toRead); - if (!(inBinary || sameEncoding)) { + if (!moveBytes) { TclNewObj(bufObj); Tcl_IncrRefCount(bufObj); } @@ -9748,7 +9737,7 @@ CopyData( underflow = 1; } else { /* - * Read up to bufSize bytes. + * Read up to bufSize characters. */ if ((csPtr->toRead == (Tcl_WideInt) -1) @@ -9758,7 +9747,7 @@ CopyData( sizeb = csPtr->toRead; } - if (inBinary || sameEncoding) { + if (moveBytes) { size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, sizeb, !GotFlag(inStatePtr, CHANNEL_NONBLOCKING)); } else { @@ -9825,25 +9814,20 @@ CopyData( * Now write the buffer out. */ - if (inBinary || sameEncoding) { + if (moveBytes) { buffer = csPtr->buffer; - sizeb = size; + sizeb = WriteBytes(outStatePtr->topChanPtr, buffer, size); } else { buffer = Tcl_GetStringFromObj(bufObj, &sizeb); - } - - if (outBinary || sameEncoding) { - sizeb = WriteBytes(outStatePtr->topChanPtr, buffer, sizeb); - } else { sizeb = WriteChars(outStatePtr->topChanPtr, buffer, sizeb); } /* * [Bug 2895565]. At this point 'size' still contains the number of - * bytes or characters which have been read. We keep this to later to + * characters which have been read. We keep this to later to * update the totals and toRead information, see marker (UP) below. We * must not overwrite it with 'sizeb', which is the number of written - * bytes or characters, and both EOL translation and encoding + * characters, and both EOL translation and encoding * conversion may have changed this number unpredictably in relation * to 'size' (It can be smaller or larger, in the latter case able to * drive toRead below -1, causing infinite looping). Completely @@ -9870,10 +9854,10 @@ CopyData( } /* - * Update the current byte count. Do it now so the count is valid + * Update the current character count. Do it now so the count is valid * before a return or break takes us out of the loop. The invariant at * the top of the loop should be that csPtr->toRead holds the number - * of bytes left to copy. + * of characters left to copy. */ if (csPtr->toRead != -1) { @@ -9940,8 +9924,8 @@ CopyData( } /* - * Make the callback or return the number of bytes transferred. The local - * total is used because StopCopy frees csPtr. + * Make the callback or return the number of characters transferred. The + * local total is used because StopCopy frees csPtr. */ total = csPtr->total; @@ -10264,6 +10248,50 @@ CopyEventProc( /* *---------------------------------------------------------------------- * + * Lossless -- + * + * Determines whether copying characters between two channel states would + * be lossless, i.e. whether one byte corresponds to one character, every + * character appears in the Unicode character set, there are no + * translations to be performed, and no inline signals to respond to. + * + * Result: + * True if copying would be lossless. + * + *---------------------------------------------------------------------- + */ +int +Lossless( + ChannelState *inStatePtr, + ChannelState *outStatePtr, + long long toRead) +{ + return inStatePtr->inEofChar == '\0' /* No eofChar to stop input */ + && inStatePtr->inputTranslation == TCL_TRANSLATE_LF + && outStatePtr->outputTranslation == TCL_TRANSLATE_LF + && ( + ( + (inStatePtr->encoding == NULL + || inStatePtr->encoding == GetBinaryEncoding() + ) + && + (outStatePtr->encoding == NULL + || outStatePtr->encoding == GetBinaryEncoding() + ) + ) + || + ( + toRead == -1 + && inStatePtr->encoding == outStatePtr->encoding + && CHANNEL_PROFILE_GET(inStatePtr->inputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8 + && CHANNEL_PROFILE_GET(outStatePtr->inputEncodingFlags) == TCL_ENCODING_PROFILE_TCL8 + ) + ); +} + +/* + *---------------------------------------------------------------------- + * * StopCopy -- * * This routine halts a copy that is in progress. diff --git a/tests/chanio.test b/tests/chanio.test index 09e71ca..ccef3e2 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -6886,18 +6886,23 @@ test chan-io-52.11 {TclCopyChannel & encodings} -setup { puts $f АА close $f } -constraints {fcopy} -body { - # binary to encoding => the input has to be in utf-8 to make sense to the - # encoder set in [open $path(utf8-fcopy.txt) r] set out [open $path(kyrillic.txt) w] # -translation binary is also -encoding binary chan configure $in -translation binary - chan configure $out -encoding koi8-r -translation lf - chan copy $in $out - chan close $in - chan close $out - file size $path(kyrillic.txt) -} -result 3 + chan configure $out -encoding koi8-r -translation lf -profile strict + catch {chan copy $in $out} cres copts + return $cres +} -cleanup { + if {$in in [chan names]} { + close $in + } + if {$out in [chan names]} { + close $out + } + catch {unset cres} +} -match glob -result {error writing "*": invalid or incomplete\ + multibyte or wide character} test chan-io-53.1 {CopyData} -setup { file delete $path(test1) diff --git a/tests/io.test b/tests/io.test index f7589ff..f235aff 100644 --- a/tests/io.test +++ b/tests/io.test @@ -7519,26 +7519,27 @@ test io-52.10 {TclCopyChannel & encodings} {fcopy} { } 5 test io-52.11 {TclCopyChannel & encodings} -setup { set out [open $path(utf8-fcopy.txt) w] - fconfigure $out -encoding utf-8 -translation lf - puts $out "АА" + fconfigure $out -encoding utf-8 -translation lf -profile strict + puts $out АА close $out } -constraints {fcopy} -body { - # binary to encoding => the input has to be - # in utf-8 to make sense to the encoder - set in [open $path(utf8-fcopy.txt) r] set out [open $path(kyrillic.txt) w] - # -translation binary is also -encoding binary fconfigure $in -translation binary - fconfigure $out -encoding koi8-r -translation lf - - fcopy $in $out - close $in - close $out - - file size $path(kyrillic.txt) -} -result 3 + fconfigure $out -encoding koi8-r -translation lf -profile strict + catch {fcopy $in $out} cres copts + return $cres +} -cleanup { + if {$in in [chan names]} { + close $in + } + if {$out in [chan names]} { + close $out + } + catch {unset cres} +} -match glob -result {error writing "*": invalid or incomplete\ + multibyte or wide character} test io-52.12 {coverage of -translation auto} { file delete $path(test1) $path(test2) @@ -7780,6 +7781,29 @@ test io-52.23 {TclCopyChannel & encodings} -setup { unset ::s0 } -match glob -result {0 {error writing "file*": invalid or incomplete multibyte or wide character}} +test io-52.24 {fcopy -size should always be characters} -setup { + set out [open utf8-fcopy-52.24.txt w] + fconfigure $out -encoding utf-8 -translation lf + puts $out "Á" + close $out +} -constraints {fcopy} -body { + set in [open utf8-fcopy-52.24.txt r] + set out [open utf8-fcopy-52.24.out.txt w+] + + fconfigure $in -encoding utf-8 -profile tcl8 + fconfigure $out -encoding utf-8 -profile tcl8 + fcopy $in $out -size 1 + seek $out 0 + # a result of \xc3 means that only the first byte of the utf-8 encoding of + # Á made it into to the output file. + read $out +} -cleanup { + close $in + close $out + catch {file delete utf8-fcopy-52.24.txt} + catch {file delete utf8-fcopy-52.24.out.txt} +} -result Á + test io-53.1 {CopyData} {fcopy} { file delete $path(test1) -- cgit v0.12