From 45d0977a5ca8ae0ead48b66feeb51c65d1ce45b4 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 23 Oct 2017 17:06:57 +0000 Subject: Implementation branch for TIP 345: Kill the "identity" encoding. This checkin, completely does that. Bad news is it causes huge failures. We're still making a lot of use of this encoding, and cannot complete this TIP until the branch is repaired. --- generic/tclEncoding.c | 77 --------------------------------------------------- 1 file changed, 77 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index e328340..37d5fb6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -195,11 +195,6 @@ static unsigned short emptyPage[256]; * Functions used only in this module. */ -static int BinaryProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); static void DupEncodingIntRep(Tcl_Obj *srcPtr, Tcl_Obj *dupPtr); static void EscapeFreeProc(ClientData clientData); static int EscapeFromUtfProc(ClientData clientData, @@ -563,14 +558,6 @@ TclInitEncodingSubsystem(void) * formed UTF-8 into a properly formed stream. */ - type.encodingName = "identity"; - type.toUtfProc = BinaryProc; - type.fromUtfProc = BinaryProc; - type.freeProc = NULL; - type.nullSize = 1; - type.clientData = NULL; - tclIdentityEncoding = Tcl_CreateEncoding(&type); - type.encodingName = "utf-8"; type.toUtfProc = UtfExtToUtfIntProc; type.fromUtfProc = UtfIntToUtfExtProc; @@ -2083,70 +2070,6 @@ LoadEscapeEncoding( /* *------------------------------------------------------------------------- * - * BinaryProc -- - * - * The default conversion when no other conversion is specified. No - * translation is done; source bytes are copied directly to destination - * bytes. - * - * Results: - * Returns TCL_OK if conversion was successful. - * - * Side effects: - * None. - * - *------------------------------------------------------------------------- - */ - -static int -BinaryProc( - ClientData clientData, /* Not used. */ - const char *src, /* Source string (unknown encoding). */ - int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ - char *dst, /* Output buffer in which converted string is - * stored. */ - int dstLen, /* The maximum length of output buffer in - * bytes. */ - int *srcReadPtr, /* Filled with the number of bytes from the - * source string that were converted. */ - int *dstWrotePtr, /* Filled with the number of bytes that were - * stored in the output buffer as a result of - * the conversion. */ - int *dstCharsPtr) /* Filled with the number of characters that - * correspond to the bytes stored in the - * output buffer. */ -{ - int result; - - result = TCL_OK; - dstLen -= TCL_UTF_MAX - 1; - if (dstLen < 0) { - dstLen = 0; - } - if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { - srcLen = *dstCharsPtr; - } - if (srcLen > dstLen) { - srcLen = dstLen; - result = TCL_CONVERT_NOSPACE; - } - - *srcReadPtr = srcLen; - *dstWrotePtr = srcLen; - *dstCharsPtr = srcLen; - memcpy(dst, src, (size_t) srcLen); - return result; -} - -/* - *------------------------------------------------------------------------- - * * UtfExtToUtfIntProc -- * * Convert from UTF-8 to UTF-8. While converting null-bytes from the -- cgit v0.12 From 319b71965b6c3a38ef712ac4e6b58fa7dae7e6f4 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 23 Oct 2017 17:58:07 +0000 Subject: backout initial commit; need more care. Binary writes internally make use of this encoding. Need to hide it instead of destroy it. --- generic/tclEncoding.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 37d5fb6..e328340 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -195,6 +195,11 @@ static unsigned short emptyPage[256]; * Functions used only in this module. */ +static int BinaryProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); static void DupEncodingIntRep(Tcl_Obj *srcPtr, Tcl_Obj *dupPtr); static void EscapeFreeProc(ClientData clientData); static int EscapeFromUtfProc(ClientData clientData, @@ -558,6 +563,14 @@ TclInitEncodingSubsystem(void) * formed UTF-8 into a properly formed stream. */ + type.encodingName = "identity"; + type.toUtfProc = BinaryProc; + type.fromUtfProc = BinaryProc; + type.freeProc = NULL; + type.nullSize = 1; + type.clientData = NULL; + tclIdentityEncoding = Tcl_CreateEncoding(&type); + type.encodingName = "utf-8"; type.toUtfProc = UtfExtToUtfIntProc; type.fromUtfProc = UtfIntToUtfExtProc; @@ -2070,6 +2083,70 @@ LoadEscapeEncoding( /* *------------------------------------------------------------------------- * + * BinaryProc -- + * + * The default conversion when no other conversion is specified. No + * translation is done; source bytes are copied directly to destination + * bytes. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +BinaryProc( + ClientData clientData, /* Not used. */ + const char *src, /* Source string (unknown encoding). */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + int result; + + result = TCL_OK; + dstLen -= TCL_UTF_MAX - 1; + if (dstLen < 0) { + dstLen = 0; + } + if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { + srcLen = *dstCharsPtr; + } + if (srcLen > dstLen) { + srcLen = dstLen; + result = TCL_CONVERT_NOSPACE; + } + + *srcReadPtr = srcLen; + *dstWrotePtr = srcLen; + *dstCharsPtr = srcLen; + memcpy(dst, src, (size_t) srcLen); + return result; +} + +/* + *------------------------------------------------------------------------- + * * UtfExtToUtfIntProc -- * * Convert from UTF-8 to UTF-8. While converting null-bytes from the -- cgit v0.12 From 1552384787fef03116569d8281bc0a69a02d6e3f Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 24 Oct 2017 23:16:23 +0000 Subject: Extend Tcl_CreateEncoding() to be able to create an encoding without registering it when it has no name. Then use this to create "identity" encoding without naming it "identity". Then no one can look it up by that name. --- generic/tclEncoding.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index e328340..f43c0e1 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -563,7 +563,7 @@ TclInitEncodingSubsystem(void) * formed UTF-8 into a properly formed stream. */ - type.encodingName = "identity"; + type.encodingName = NULL; type.toUtfProc = BinaryProc; type.fromUtfProc = BinaryProc; type.freeProc = NULL; @@ -851,7 +851,9 @@ FreeEncoding( if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - ckfree(encodingPtr->name); + if (encodingPtr->name) { + ckfree(encodingPtr->name); + } ckfree(encodingPtr); } } @@ -1040,9 +1042,24 @@ Tcl_CreateEncoding( const Tcl_EncodingType *typePtr) /* The encoding type. */ { + Encoding *encodingPtr = ckalloc(sizeof(Encoding)); + encodingPtr->name = NULL; + encodingPtr->toUtfProc = typePtr->toUtfProc; + encodingPtr->fromUtfProc = typePtr->fromUtfProc; + encodingPtr->freeProc = typePtr->freeProc; + encodingPtr->nullSize = typePtr->nullSize; + encodingPtr->clientData = typePtr->clientData; + if (typePtr->nullSize == 1) { + encodingPtr->lengthProc = (LengthProc *) strlen; + } else { + encodingPtr->lengthProc = (LengthProc *) unilen; + } + encodingPtr->refCount = 1; + encodingPtr->hPtr = NULL; + + if (typePtr->encodingName) { Tcl_HashEntry *hPtr; int isNew; - Encoding *encodingPtr; char *name; Tcl_MutexLock(&encodingMutex); @@ -1058,25 +1075,12 @@ Tcl_CreateEncoding( } name = ckalloc(strlen(typePtr->encodingName) + 1); - - encodingPtr = ckalloc(sizeof(Encoding)); encodingPtr->name = strcpy(name, typePtr->encodingName); - encodingPtr->toUtfProc = typePtr->toUtfProc; - encodingPtr->fromUtfProc = typePtr->fromUtfProc; - encodingPtr->freeProc = typePtr->freeProc; - encodingPtr->nullSize = typePtr->nullSize; - encodingPtr->clientData = typePtr->clientData; - if (typePtr->nullSize == 1) { - encodingPtr->lengthProc = (LengthProc *) strlen; - } else { - encodingPtr->lengthProc = (LengthProc *) unilen; - } - encodingPtr->refCount = 1; encodingPtr->hPtr = hPtr; Tcl_SetHashValue(hPtr, encodingPtr); Tcl_MutexUnlock(&encodingMutex); - + } return (Tcl_Encoding) encodingPtr; } -- cgit v0.12 From 40d9b81e23c10ebc3abfe4ac12af411d01318fb9 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 24 Oct 2017 23:30:35 +0000 Subject: Stop using "identity" as an encoding to test basic functionng of the [encoding] command. "iso8859-1" is another one always available. --- tests/cmdAH.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 3c58c1b..03f4d66 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -188,7 +188,7 @@ test cmdAH-4.5 {Tcl_EncodingObjCmd} -setup { test cmdAH-4.6 {Tcl_EncodingObjCmd} -setup { set system [encoding system] } -body { - encoding system identity + encoding system iso8859-1 encoding convertto jis0208 \u4e4e } -cleanup { encoding system $system @@ -210,7 +210,7 @@ test cmdAH-4.9 {Tcl_EncodingObjCmd} -setup { test cmdAH-4.10 {Tcl_EncodingObjCmd} -setup { set system [encoding system] } -body { - encoding system identity + encoding system iso8859-1 encoding convertfrom jis0208 8C } -cleanup { encoding system $system @@ -224,11 +224,11 @@ test cmdAH-4.12 {Tcl_EncodingObjCmd} -returnCodes error -body { test cmdAH-4.13 {Tcl_EncodingObjCmd} -setup { set system [encoding system] } -body { - encoding system identity + encoding system iso8859-1 encoding system } -cleanup { encoding system $system -} -result identity +} -result iso8859-1 test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body { file -- cgit v0.12 From eb2cfa3e6f2fc11362ae54a2004ade43a57c86de Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 25 Oct 2017 02:23:12 +0000 Subject: Repair Tcl_CreateEncoding(); Modernize [testencoding]; Update most tests toying with identity encoding. --- generic/tclEncoding.c | 4 ++-- generic/tclTest.c | 9 ++++++--- tests/encoding.test | 12 ++++-------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index f43c0e1..1f48a1c 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1070,8 +1070,8 @@ Tcl_CreateEncoding( * reference goes away. */ - encodingPtr = Tcl_GetHashValue(hPtr); - encodingPtr->hPtr = NULL; + Encoding *replaceMe = Tcl_GetHashValue(hPtr); + replaceMe->hPtr = NULL; } name = ckalloc(strlen(typePtr->encodingName) + 1); diff --git a/generic/tclTest.c b/generic/tclTest.c index ebd90ae..c455d42 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -1985,9 +1985,12 @@ TestencodingObjCmd( if (objc != 3) { return TCL_ERROR; } - encoding = Tcl_GetEncoding(NULL, Tcl_GetString(objv[2])); - Tcl_FreeEncoding(encoding); - Tcl_FreeEncoding(encoding); + if (TCL_OK != Tcl_GetEncodingFromObj(interp, objv[2], &encoding)) { + return TCL_ERROR; + } + Tcl_FreeEncoding(encoding); /* Free returned reference */ + Tcl_FreeEncoding(encoding); /* Free to match CREATE */ + TclFreeIntRep(objv[2]); /* Free the cached ref */ break; } return TCL_OK; diff --git a/tests/encoding.test b/tests/encoding.test index be1f4d5..0ee08b6 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -75,11 +75,11 @@ test encoding-2.2 {Tcl_FreeEncoding: refcount != 0} -setup { encoding system shiftjis ;# incr ref count encoding dirs [list [pwd]] set x [encoding convertto shiftjis \u4e4e] ;# old one found - encoding system identity + encoding system iso8859-1 llength shiftjis ;# Shimmer away any cache of Tcl_Encoding lappend x [catch {encoding convertto shiftjis \u4e4e} msg] $msg } -cleanup { - encoding system identity + encoding system iso8859-1 encoding dirs $path encoding system $system } -result "\u008c\u00c1 1 {unknown encoding \"shiftjis\"}" @@ -136,7 +136,7 @@ test encoding-5.1 {Tcl_SetSystemEncoding} -setup { encoding system jis0208 encoding convertto \u4e4e } -cleanup { - encoding system identity + encoding system iso8859-1 encoding system $old } -result {8C} test encoding-5.2 {Tcl_SetSystemEncoding: test ref count} { @@ -259,7 +259,7 @@ test encoding-11.5.1 {LoadEncodingFile: escape file} { test encoding-11.6 {LoadEncodingFile: invalid file} -constraints {testencoding} -setup { set system [encoding system] set path [encoding dirs] - encoding system identity + encoding system iso8859-1 } -body { cd [temporaryDirectory] encoding dirs [file join tmp encoding] @@ -308,10 +308,6 @@ test encoding-13.1 {LoadEscapeTable} { viewable [set x [encoding convertto iso2022 ab\u4e4e\u68d9g]] } [viewable "ab\x1b\$B8C\x1b\$\(DD%\x1b(Bg"] -test encoding-14.1 {BinaryProc} { - encoding convertto identity \x12\x34\x56\xff\x69 -} "\x12\x34\x56\xc3\xbf\x69" - test encoding-15.1 {UtfToUtfProc} { encoding convertto utf-8 \xa3 } "\xc2\xa3" -- cgit v0.12 From 30c6827ee17807b25e79676f049f66877bf3d4ff Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 25 Oct 2017 02:54:24 +0000 Subject: Convert remaining tests to use [testbytestring]. encoding-15.3 still needs replacement for [encoding convertto identity]. That is, some testing command to expose objPtr->bytes. --- tests/encoding.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index 0ee08b6..11f08ef 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -34,6 +34,7 @@ proc runtests {} { # Some tests require the testencoding command testConstraint testencoding [llength [info commands testencoding]] +testConstraint testbytestring [llength [info commands testbytestring]] testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] testConstraint exec [llength [info commands exec]] testConstraint testgetencpath [llength [info commands testgetencpath]] @@ -311,15 +312,14 @@ test encoding-13.1 {LoadEscapeTable} { test encoding-15.1 {UtfToUtfProc} { encoding convertto utf-8 \xa3 } "\xc2\xa3" -test encoding-15.2 {UtfToUtfProc null character output} { +test encoding-15.2 {UtfToUtfProc null character output} testbytestring { set x \u0000 - set y [encoding convertto utf-8 \u0000] - set y [encoding convertfrom identity $y] + set y [testbytestring [encoding convertto utf-8 \u0000]] binary scan $y H* z list [string bytelength $x] [string bytelength $y] $z } {2 1 00} -test encoding-15.3 {UtfToUtfProc null character input} { - set x [encoding convertfrom identity \x00] +test encoding-15.3 {UtfToUtfProc null character input} testbytestring { + set x [testbytestring \x00] set y [encoding convertfrom utf-8 $x] binary scan [encoding convertto identity $y] H* z list [string bytelength $x] [string bytelength $y] $z -- cgit v0.12 From 053a74e0facac37a830c2c317c90d11bc13e5264 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 26 Oct 2017 02:08:27 +0000 Subject: Create new testing command [teststringbytes] to probe the things that otherwise require [encoding convertto identity]. adapt encoding-15.3 to use. --- generic/tclTest.c | 38 ++++++++++++++++++++++++++++++++++++++ tests/encoding.test | 20 +++++++++----------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index c455d42..834cd79 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -227,6 +227,9 @@ static int TestasyncCmd(ClientData dummy, static int TestbytestringObjCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); +static int TeststringbytesObjCmd(ClientData clientData, + Tcl_Interp *interp, int objc, + Tcl_Obj *const objv[]); static int TestcmdinfoCmd(ClientData dummy, Tcl_Interp *interp, int argc, const char **argv); static int TestcmdtokenCmd(ClientData dummy, @@ -581,6 +584,7 @@ Tcltest_Init( Tcl_CreateCommand(interp, "noop", NoopCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "noop", NoopObjCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testbytestring", TestbytestringObjCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "teststringbytes", TeststringbytesObjCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testwrongnumargs", TestWrongNumArgsObjCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testfilesystem", TestFilesystemObjCmd, @@ -5049,6 +5053,40 @@ NoopObjCmd( /* *---------------------------------------------------------------------- * + * TeststringbytesObjCmd -- + * Returns bytearray value of the bytes in argument string rep + * + * Results: + * Returns the TCL_OK result code. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +TeststringbytesObjCmd( + ClientData unused, /* Not used. */ + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* The argument objects. */ +{ + int n; + const unsigned char *p; + + if (objc != 2) { + Tcl_WrongNumArgs(interp, 1, objv, "value"); + return TCL_ERROR; + } + p = (const unsigned char *)Tcl_GetStringFromObj(objv[1], &n); + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(p, n)); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * * TestbytestringObjCmd -- * * This object-based procedure constructs a string which can diff --git a/tests/encoding.test b/tests/encoding.test index 11f08ef..e447c20 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -35,6 +35,7 @@ proc runtests {} { # Some tests require the testencoding command testConstraint testencoding [llength [info commands testencoding]] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] testConstraint exec [llength [info commands exec]] testConstraint testgetencpath [llength [info commands testgetencpath]] @@ -313,17 +314,14 @@ test encoding-15.1 {UtfToUtfProc} { encoding convertto utf-8 \xa3 } "\xc2\xa3" test encoding-15.2 {UtfToUtfProc null character output} testbytestring { - set x \u0000 - set y [testbytestring [encoding convertto utf-8 \u0000]] - binary scan $y H* z - list [string bytelength $x] [string bytelength $y] $z -} {2 1 00} -test encoding-15.3 {UtfToUtfProc null character input} testbytestring { - set x [testbytestring \x00] - set y [encoding convertfrom utf-8 $x] - binary scan [encoding convertto identity $y] H* z - list [string bytelength $x] [string bytelength $y] $z -} {1 2 c080} + binary scan [testbytestring [encoding convertto utf-8 \u0000]] H* z + set z +} 00 +test encoding-15.3 {UtfToUtfProc null character input} teststringbytes { + set y [encoding convertfrom utf-8 [encoding convertto utf-8 \u0000]] + binary scan [teststringbytes $y] H* z + set z +} c080 test encoding-16.1 {UnicodeToUtfProc} { set val [encoding convertfrom unicode NN] -- cgit v0.12