diff options
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | doc/binary.n | 26 | ||||
-rw-r--r-- | generic/tclBinary.c | 261 | ||||
-rw-r--r-- | tests/binary.test | 75 |
4 files changed, 302 insertions, 71 deletions
@@ -1,3 +1,14 @@ +2013-09-17 Donal Fellows <dkf@Donals-MacBook-Pro.local> + + * generic/tclBinary.c (BinaryEncodeUu, BinaryDecodeUu): [Bug 2152292]: + Corrected implementation of the core of uuencode handling so that the + line length processing is correctly applied. + ***POTENTIAL INCOMPATIBILITY*** + Existing code that was using the old versions and working around the + limitations will now need to do far less. The -maxlen option now has + strict limits on the range of supported lengths; this is a limitation + of the format itself. + 2013-09-09 Donal Fellows <dkf@users.sf.net> * generic/tclOOMethod.c (CloneProcedureMethod): [Bug 3609693]: Strip diff --git a/doc/binary.n b/doc/binary.n index a40afe6..cbbebd1 100644 --- a/doc/binary.n +++ b/doc/binary.n @@ -36,6 +36,13 @@ The \fBbinary encode\fR and \fBbinary decode\fR subcommands convert binary data to or from string encodings such as base64 (used in MIME messages for example). .VE 8.6 +.PP +Note that other operations on binary data, such as taking a subsequence of it, +getting its length, or reinterpreting it as a string in some encoding, are +done by other Tcl commands (respectively \fBstring range\fR, +\fBstring length\fR and \fBencoding convertfrom\fR in the example cases). A +binary string in Tcl is merely one where all the characters it contains are in +the range \eu0000\-\eu00FF. .SH "BINARY ENCODE AND DECODE" .VS 8.6 .PP @@ -95,13 +102,14 @@ between Unix systems and on USENET, but is less common these days, having been largely superseded by the \fBbase64\fR binary encoding. .RS .PP -During encoding, the following options are supported: -'\" This is wrong! The uuencode format had more complexity than this! +During encoding, the following options are supported (though changing them may +produce files that other implementations of decoders cannot process): .TP \fB\-maxlen \fIlength\fR . Indicates that the output should be split into lines of no more than -\fIlength\fR characters. By default, lines are not split. +\fIlength\fR characters. By default, lines are split every 61 characters, and +this must be in the range 3 to 85 due to limitations in the encoding. .TP \fB\-wrapchar \fIcharacter\fR . @@ -114,7 +122,11 @@ During decoding, the following options are supported: .TP \fB\-strict\fR . -Instructs the decoder to throw an error if it encounters whitespace characters. Otherwise it ignores them. +Instructs the decoder to throw an error if it encounters unexpected whitespace +characters. Otherwise it ignores them. +.PP +Note that neither the encoder nor the decoder handle the header and footer of +the uuencode format. .RE .VE 8.6 .SH "BINARY FORMAT" @@ -855,6 +867,7 @@ architectures, use their textual representation (as produced by .PP This is a procedure to write a Tcl string to a binary-encoded channel as UTF-8 data preceded by a length word: +.PP .CS proc \fIwriteString\fR {channel string} { set data [encoding convertto utf-8 $string] @@ -865,6 +878,7 @@ proc \fIwriteString\fR {channel string} { .PP This procedure reads a string from a channel that was written by the previously presented \fIwriteString\fR procedure: +.PP .CS proc \fIreadString\fR {channel} { if {![\fBbinary scan\fR [read $channel 4] I length]} { @@ -877,6 +891,7 @@ proc \fIreadString\fR {channel} { .PP This converts the contents of a file (named in the variable \fIfilename\fR) to base64 and prints them: +.PP .CS set f [open $filename rb] set data [read $f] @@ -884,9 +899,10 @@ close $f puts [\fBbinary encode\fR base64 \-maxlen 64 $data] .CE .SH "SEE ALSO" -format(n), scan(n), tcl_platform(n) +encoding(n), format(n), scan(n), string(n), tcl_platform(n) .SH KEYWORDS binary, format, scan '\" Local Variables: '\" mode: nroff +'\" fill-column: 78 '\" End: diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 7625d39..58583f4 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -87,10 +87,13 @@ static int BinaryDecodeHex(ClientData clientData, static int BinaryEncode64(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); -static int BinaryDecodeUu(ClientData clientData, +static int BinaryDecode64(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); -static int BinaryDecode64(ClientData clientData, +static int BinaryEncodeUu(ClientData clientData, + Tcl_Interp *interp, int objc, + Tcl_Obj *const objv[]); +static int BinaryDecodeUu(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); @@ -139,9 +142,9 @@ static const EnsembleImplMap binaryMap[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; static const EnsembleImplMap encodeMap[] = { - { "hex", BinaryEncodeHex, TclCompileBasic1ArgCmd, NULL, (ClientData)HexDigits, 0 }, - { "uuencode", BinaryEncode64, NULL, NULL, (ClientData)UueDigits, 0 }, - { "base64", BinaryEncode64, NULL, NULL, (ClientData)B64Digits, 0 }, + { "hex", BinaryEncodeHex, TclCompileBasic1ArgCmd, NULL, NULL, 0 }, + { "uuencode", BinaryEncodeUu, NULL, NULL, NULL, 0 }, + { "base64", BinaryEncode64, NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; static const EnsembleImplMap decodeMap[] = { @@ -2312,7 +2315,6 @@ BinaryEncodeHex( Tcl_Obj *resultObj = NULL; unsigned char *data = NULL; unsigned char *cursor = NULL; - const char *digits = clientData; int offset = 0, count = 0; if (objc != 2) { @@ -2324,8 +2326,8 @@ BinaryEncodeHex( data = Tcl_GetByteArrayFromObj(objv[1], &count); cursor = Tcl_SetByteArrayLength(resultObj, count * 2); for (offset = 0; offset < count; ++offset) { - *cursor++ = digits[((data[offset] >> 4) & 0x0f)]; - *cursor++ = digits[(data[offset] & 0x0f)]; + *cursor++ = HexDigits[((data[offset] >> 4) & 0x0f)]; + *cursor++ = HexDigits[(data[offset] & 0x0f)]; } Tcl_SetObjResult(interp, resultObj); return TCL_OK; @@ -2439,7 +2441,7 @@ BinaryDecodeHex( * This implements a generic 6 bit binary encoding. Input is broken into * 6 bit chunks and a lookup table passed in via clientData is used to * turn these values into output characters. This is used to implement - * base64 and uuencode binary encodings. + * base64 binary encodings. * * Results: * Interp result set to an encoded byte array object @@ -2475,7 +2477,6 @@ BinaryEncode64( { Tcl_Obj *resultObj; unsigned char *data, *cursor, *limit; - const char *digits = clientData; int maxlen = 0; const char *wrapchar = "\n"; int wrapcharlen = 1; @@ -2498,6 +2499,12 @@ BinaryEncode64( if (Tcl_GetIntFromObj(interp, objv[i+1], &maxlen) != TCL_OK) { return TCL_ERROR; } + if (maxlen < 0) { + Tcl_SetResult(interp, "line length out of range", TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "BINARY", "ENCODE", + "LINE_LENGTH", NULL); + return TCL_ERROR; + } break; case OPT_WRAPCHAR: wrapchar = Tcl_GetStringFromObj(objv[i+1], &wrapcharlen); @@ -2528,17 +2535,17 @@ BinaryEncode64( for (i = 0; i < 3 && offset+i < count; ++i) { d[i] = data[offset + i]; } - OUTPUT(digits[d[0] >> 2]); - OUTPUT(digits[((d[0] & 0x03) << 4) | (d[1] >> 4)]); + OUTPUT(B64Digits[d[0] >> 2]); + OUTPUT(B64Digits[((d[0] & 0x03) << 4) | (d[1] >> 4)]); if (offset+1 < count) { - OUTPUT(digits[((d[1] & 0x0f) << 2) | (d[2] >> 6)]); + OUTPUT(B64Digits[((d[1] & 0x0f) << 2) | (d[2] >> 6)]); } else { - OUTPUT(digits[64]); + OUTPUT(B64Digits[64]); } if (offset+2 < count) { - OUTPUT(digits[d[2] & 0x3f]); + OUTPUT(B64Digits[d[2] & 0x3f]); } else { - OUTPUT(digits[64]); + OUTPUT(B64Digits[64]); } } } @@ -2550,6 +2557,124 @@ BinaryEncode64( /* *---------------------------------------------------------------------- * + * BinaryEncodeUu -- + * + * This implements the uuencode binary encoding. Input is broken into 6 + * bit chunks and a lookup table is used to turn these values into output + * characters. This differs from the generic code above in that line + * lengths are also encoded. + * + * Results: + * Interp result set to an encoded byte array object + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +static int +BinaryEncodeUu( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + Tcl_Obj *resultObj; + unsigned char *data, *start, *cursor; + int offset, count, rawLength, n, i, j, bits, index; + int lineLength = 61; + const unsigned char SingleNewline[] = { (unsigned char) '\n' }; + const unsigned char *wrapchar = SingleNewline; + int wrapcharlen = sizeof(SingleNewline); + enum { OPT_MAXLEN, OPT_WRAPCHAR }; + static const char *const optStrings[] = { "-maxlen", "-wrapchar", NULL }; + + if (objc < 2 || objc%2 != 0) { + Tcl_WrongNumArgs(interp, 1, objv, + "?-maxlen len? ?-wrapchar char? data"); + return TCL_ERROR; + } + for (i = 1; i < objc-1; i += 2) { + if (Tcl_GetIndexFromObj(interp, objv[i], optStrings, "option", + TCL_EXACT, &index) != TCL_OK) { + return TCL_ERROR; + } + switch (index) { + case OPT_MAXLEN: + if (Tcl_GetIntFromObj(interp, objv[i+1], &lineLength) != TCL_OK) { + return TCL_ERROR; + } + if (lineLength < 3 || lineLength > 85) { + Tcl_SetResult(interp, "line length out of range", TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "BINARY", "ENCODE", + "LINE_LENGTH", NULL); + return TCL_ERROR; + } + break; + case OPT_WRAPCHAR: + wrapchar = Tcl_GetByteArrayFromObj(objv[i+1], &wrapcharlen); + break; + } + } + + /* + * Allocate the buffer. This is a little bit too long, but is "good + * enough". + */ + + resultObj = Tcl_NewObj(); + offset = 0; + data = Tcl_GetByteArrayFromObj(objv[objc-1], &count); + rawLength = (lineLength - 1) * 3 / 4; + start = cursor = Tcl_SetByteArrayLength(resultObj, + (lineLength + wrapcharlen) * + ((count + (rawLength - 1)) / rawLength)); + n = bits = 0; + + /* + * Encode the data. Each output line first has the length of raw data + * encoded by the output line described in it by one encoded byte, then + * the encoded data follows (encoding each 6 bits as one character). + * Encoded lines are always terminated by a newline. + */ + + while (offset < count) { + int lineLen = count - offset; + + if (lineLen > rawLength) { + lineLen = rawLength; + } + *cursor++ = UueDigits[lineLen]; + for (i=0 ; i<lineLen ; i++) { + n <<= 8; + n |= data[offset++]; + for (bits += 8; bits > 6 ; bits -= 6) { + *cursor++ = UueDigits[(n >> (bits-6)) & 0x3f]; + } + } + if (bits > 0) { + n <<= 8; + *cursor++ = UueDigits[(n >> (bits + 2)) & 0x3f]; + bits = 0; + } + for (j=0 ; j<wrapcharlen ; ++j) { + *cursor++ = wrapchar[j]; + } + } + + /* + * Fix the length of the output bytearray. + */ + + Tcl_SetByteArrayLength(resultObj, cursor-start); + Tcl_SetObjResult(interp, resultObj); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * * BinaryDecodeUu -- * * Decode a uuencoded string. @@ -2573,8 +2698,8 @@ BinaryDecodeUu( Tcl_Obj *resultObj = NULL; unsigned char *data, *datastart, *dataend; unsigned char *begin, *cursor; - int i, index, size, count = 0, cut = 0, strict = 0; - char c; + int i, index, size, count = 0, strict = 0, lineLen; + unsigned char c; enum {OPT_STRICT }; static const char *const optStrings[] = { "-strict", NULL }; @@ -2600,44 +2725,112 @@ BinaryDecodeUu( dataend = data + count; size = ((count + 3) & ~3) * 3 / 4; begin = cursor = Tcl_SetByteArrayLength(resultObj, size); + lineLen = -1; + + /* + * The decoding loop. First, we get the length of line (strictly, the + * number of data bytes we expect to generate from the line) we're + * processing this time round if it is not already known (i.e., when the + * lineLen variable is set to the magic value, -1). + */ + while (data < dataend) { char d[4] = {0, 0, 0, 0}; + if (lineLen < 0) { + c = *data++; + if (c < 32 || c > 96) { + if (strict || !isspace(c)) { + goto badUu; + } + i--; + continue; + } + lineLen = (c - 32) & 0x3f; + } + + /* + * Now we read a four-character grouping. + */ + for (i=0 ; i<4 ; i++) { if (data < dataend) { d[i] = c = *data++; - if (c < 33 || c > 96) { - if (strict || !isspace(UCHAR(c))) { - goto badUu; + if (c < 32 || c > 96) { + if (strict) { + if (!isspace(c)) { + goto badUu; + } else if (c == '\n') { + goto shortUu; + } } i--; continue; } - } else { - cut++; } } - if (cut > 3) { - cut = 3; + + /* + * Translate that grouping into (up to) three binary bytes output. + */ + + if (lineLen > 0) { + *cursor++ = (((d[0] - 0x20) & 0x3f) << 2) + | (((d[1] - 0x20) & 0x3f) >> 4); + if (--lineLen > 0) { + *cursor++ = (((d[1] - 0x20) & 0x3f) << 4) + | (((d[2] - 0x20) & 0x3f) >> 2); + if (--lineLen > 0) { + *cursor++ = (((d[2] - 0x20) & 0x3f) << 6) + | (((d[3] - 0x20) & 0x3f)); + lineLen--; + } + } + } + + /* + * If we've reached the end of the line, skip until we process a + * newline. + */ + + if (lineLen == 0 && data < dataend) { + lineLen = -1; + do { + c = *data++; + if (c == '\n') { + break; + } else if (c >= 32 && c <= 96) { + data--; + break; + } else if (strict || !isspace(c)) { + goto badUu; + } + } while (data < dataend); } - *cursor++ = (((d[0] - 0x20) & 0x3f) << 2) - | (((d[1] - 0x20) & 0x3f) >> 4); - *cursor++ = (((d[1] - 0x20) & 0x3f) << 4) - | (((d[2] - 0x20) & 0x3f) >> 2); - *cursor++ = (((d[2] - 0x20) & 0x3f) << 6) - | (((d[3] - 0x20) & 0x3f)); } - if (cut > size) { - cut = size; + + /* + * Sanity check, clean up and finish. + */ + + if (lineLen > 0 && strict) { + goto shortUu; } - Tcl_SetByteArrayLength(resultObj, cursor - begin - cut); + Tcl_SetByteArrayLength(resultObj, cursor - begin); Tcl_SetObjResult(interp, resultObj); return TCL_OK; + shortUu: + Tcl_SetObjResult(interp, Tcl_ObjPrintf("short uuencode data")); + Tcl_SetErrorCode(interp, "TCL", "BINARY", "DECODE", "SHORT", NULL); + TclDecrRefCount(resultObj); + return TCL_ERROR; + badUu: Tcl_SetObjResult(interp, Tcl_ObjPrintf( "invalid uuencode character \"%c\" at position %d", c, (int) (data - datastart - 1))); + Tcl_SetErrorCode(interp, "TCL", "BINARY", "DECODE", "INVALID", NULL); TclDecrRefCount(resultObj); return TCL_ERROR; } diff --git a/tests/binary.test b/tests/binary.test index d424837..40b1315 100644 --- a/tests/binary.test +++ b/tests/binary.test @@ -2705,105 +2705,116 @@ test binary-74.1 {binary encode uuencode} -body { } -returnCodes error -match glob -result "wrong # args: *" test binary-74.2 {binary encode uuencode} -body { binary encode uuencode abc -} -result {86)C} +} -result {#86)C +} test binary-74.3 {binary encode uuencode} -body { binary encode uuencode {} } -result {} test binary-74.4 {binary encode uuencode} -body { binary encode uuencode [string repeat abc 20] -} -result [string repeat 86)C 20] +} -result "M[string repeat 86)C 15]\n/[string repeat 86)C 5]\n" test binary-74.5 {binary encode uuencode} -body { binary encode uuencode \0\1\2\3\4\0\1\2\3 -} -result "``\$\"`P0``0(#" +} -result ")``\$\"`P0``0(#\n" test binary-74.6 {binary encode uuencode} -body { binary encode uuencode \0 -} -result {````} +} -result {!`` +} test binary-74.7 {binary encode uuencode} -body { binary encode uuencode \0\0 -} -result {````} +} -result "\"``` +" test binary-74.8 {binary encode uuencode} -body { binary encode uuencode \0\0\0 -} -result {````} +} -result {#```` +} test binary-74.9 {binary encode uuencode} -body { binary encode uuencode \0\0\0\0 -} -result {````````} -test binary-74.10 {binary encode uuencode} -body { - binary encode uuencode -maxlen 0 -wrapchar | abcabcabc -} -result {86)C86)C86)C} -test binary-74.11 {binary encode uuencode} -body { - binary encode uuencode -maxlen 1 -wrapchar | abcabcabc -} -result {8|6|)|C|8|6|)|C|8|6|)|C} +} -result {$`````` +} +test binary-74.10 {binary encode uuencode} -returnCodes error -body { + binary encode uuencode -foo 30 abcabcabc +} -result {bad option "-foo": must be -maxlen or -wrapchar} +test binary-74.11 {binary encode uuencode} -returnCodes error -body { + binary encode uuencode -maxlen 1 abcabcabc +} -result {line length out of range} +test binary-74.12 {binary encode uuencode} -body { + binary encode uuencode -maxlen 3 -wrapchar | abcabcabc +} -result {!80|!8@|!8P|!80|!8@|!8P|!80|!8@|!8P|} test binary-75.1 {binary decode uuencode} -body { binary decode uuencode } -returnCodes error -match glob -result "wrong # args: *" test binary-75.2 {binary decode uuencode} -body { - binary decode uuencode 86)C + binary decode uuencode "#86)C\n" } -result {abc} test binary-75.3 {binary decode uuencode} -body { binary decode uuencode {} } -result {} +test binary-75.3.1 {binary decode uuencode} -body { + binary decode uuencode `\n +} -result {} test binary-75.4 {binary decode uuencode} -body { - binary decode uuencode [string repeat "86)C" 20] + binary decode uuencode "M[string repeat 86)C 15]\n/[string repeat 86)C 5]\n" } -result [string repeat abc 20] test binary-75.5 {binary decode uuencode} -body { - binary decode uuencode "``\$\"`P0``0(#" + binary decode uuencode ")``\$\"`P0``0(#" } -result "\0\1\2\3\4\0\1\2\3" test binary-75.6 {binary decode uuencode} -body { - string length [binary decode uuencode {`}] + string length [binary decode uuencode "`\n"] } -result 0 test binary-75.7 {binary decode uuencode} -body { - string length [binary decode uuencode {``}] + string length [binary decode uuencode "!`\n"] } -result 1 test binary-75.8 {binary decode uuencode} -body { - string length [binary decode uuencode {```}] + string length [binary decode uuencode "\"``\n"] } -result 2 test binary-75.9 {binary decode uuencode} -body { - string length [binary decode uuencode {````}] + string length [binary decode uuencode "#```\n"] } -result 3 test binary-75.10 {binary decode uuencode} -body { - set s "[string repeat 86)C 10]\n[string repeat 86)C 10]" + set s ">[string repeat 86)C 10]\n>[string repeat 86)C 10]" binary decode uuencode $s } -result [string repeat abc 20] test binary-75.11 {binary decode uuencode} -body { - set s "[string repeat 86)C 10]\n [string repeat 86)C 10]" + set s ">[string repeat 86)C 10]\n\t>\t[string repeat 86)C 10]\r" binary decode uuencode $s } -result [string repeat abc 20] test binary-75.12 {binary decode uuencode} -body { binary decode uuencode -strict "|86)C" } -returnCodes error -match glob -result {invalid uuencode character "|" at position 0} test binary-75.13 {binary decode uuencode} -body { - set s "[string repeat 86)C 10]|[string repeat 86)C 10]" + set s ">[string repeat 86)C 10]|[string repeat 86)C 10]" binary decode uuencode -strict $s -} -returnCodes error -match glob -result {invalid uuencode character "|" at position 40} +} -returnCodes error -match glob -result {invalid uuencode character "|" at position 41} test binary-75.14 {binary decode uuencode} -body { - set s "[string repeat 86)C 10]\n [string repeat 86)C 10]" + set s ">[string repeat 86)C 10]\na[string repeat 86)C 10]" binary decode uuencode -strict $s } -returnCodes error -match glob -result {invalid uuencode character *} test binary-75.20 {binary decode uuencode} -body { - set r [binary decode uuencode 8] + set r [binary decode uuencode " 8"] list [string length $r] $r } -result {0 {}} test binary-75.21 {binary decode uuencode} -body { - set r [binary decode uuencode 86] + set r [binary decode uuencode "!86"] list [string length $r] $r } -result {1 a} test binary-75.22 {binary decode uuencode} -body { - set r [binary decode uuencode 86)] + set r [binary decode uuencode "\"86)"] list [string length $r] $r } -result {2 ab} test binary-75.23 {binary decode uuencode} -body { - set r [binary decode uuencode 86)C] + set r [binary decode uuencode "#86)C"] list [string length $r] $r } -result {3 abc} test binary-75.24 {binary decode uuencode} -body { - set s "04)\# " + set s "#04)\# " binary decode uuencode $s } -result ABC test binary-75.25 {binary decode uuencode} -body { - set s "04)\#z" + set s "#04)\#z" binary decode uuencode $s -} -returnCodes error -match glob -result {invalid uuencode character "z" at position 4} +} -returnCodes error -match glob -result {invalid uuencode character "z" at position 5} test binary-75.26 {binary decode uuencode} -body { string length [binary decode uuencode " "] } -result 0 |