diff options
| -rw-r--r-- | generic/tcl.decls | 14 | ||||
| -rw-r--r-- | generic/tcl.h | 28 | ||||
| -rw-r--r-- | generic/tclCmdAH.c | 426 | ||||
| -rw-r--r-- | generic/tclDecls.h | 28 | ||||
| -rw-r--r-- | generic/tclEncoding.c | 666 | ||||
| -rw-r--r-- | generic/tclIO.c | 149 | ||||
| -rw-r--r-- | generic/tclIO.h | 5 | ||||
| -rw-r--r-- | generic/tclInt.h | 14 | ||||
| -rw-r--r-- | generic/tclUtil.c | 9 | ||||
| -rw-r--r-- | generic/tclZlib.c | 8 | ||||
| -rw-r--r-- | unix/tclUnixChan.c | 4 | ||||
| -rw-r--r-- | unix/tclUnixFCmd.c | 46 | ||||
| -rw-r--r-- | unix/tclUnixFile.c | 18 | ||||
| -rw-r--r-- | unix/tclUnixInit.c | 2 | ||||
| -rw-r--r-- | win/tclWinSock.c | 4 |
15 files changed, 889 insertions, 532 deletions
diff --git a/generic/tcl.decls b/generic/tcl.decls index 3778de6..1608a88 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2500,13 +2500,17 @@ declare 656 { declare 657 { int Tcl_UniCharIsUnicode(int ch) } + +# TIP 656 declare 658 { - Tcl_Size Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr) -} + int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, + const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr) +} declare 659 { - Tcl_Size Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr) + int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, + const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr) } # TIP #511 diff --git a/generic/tcl.h b/generic/tcl.h index fa4da26..6040099 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -1948,14 +1948,8 @@ typedef struct Tcl_EncodingType { * 0x00. Only valid for "utf-8" and "cesu-8". * This flag is implicit for external -> internal conversions, * optional for internal -> external conversions. - * TCL_ENCODING_NOCOMPLAIN - If set, the converter - * substitutes the problematic character(s) with - * one or more "close" characters in the - * destination buffer and then continues to - * convert the source. If clear, the converter returns - * immediately upon encountering an invalid byte sequence - * or a source character that has no mapping in the - * target encoding. Only for Tcl 9.x. + * TCL_ENCODING_PROFILE_* - Mutually exclusive encoding profile ids. Note + * these are bit masks. */ #define TCL_ENCODING_START 0x01 @@ -1970,7 +1964,23 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_NO_TERMINATE 0x08 #define TCL_ENCODING_CHAR_LIMIT 0x10 #define TCL_ENCODING_MODIFIED 0x20 -#define TCL_ENCODING_NOCOMPLAIN 0x40 +/* Reserve top byte for profile values (disjoint) */ +#define TCL_ENCODING_PROFILE_TCL8 0x01000000 +#define TCL_ENCODING_PROFILE_STRICT 0x02000000 +#define TCL_ENCODING_PROFILE_REPLACE 0x03000000 +#define TCL_ENCODING_PROFILE_MASK 0xFF000000 +#define TCL_ENCODING_PROFILE_GET(flags_) ((flags_) & TCL_ENCODING_PROFILE_MASK) +#define TCL_ENCODING_PROFILE_SET(flags_, profile_) \ + do { \ + (flags_) &= ~TCL_ENCODING_PROFILE_MASK; \ + (flags_) |= profile_; \ + } while (0) +/* Still being argued - For Tcl9, is the default strict? TODO */ +#if TCL_MAJOR_VERSION < 9 +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 +#else +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? REPLACE? TODO */ +#endif /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 4df1216..c60a077 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -51,6 +51,7 @@ static Tcl_ObjCmdProc EncodingConvertfromObjCmd; static Tcl_ObjCmdProc EncodingConverttoObjCmd; static Tcl_ObjCmdProc EncodingDirsObjCmd; static Tcl_ObjCmdProc EncodingNamesObjCmd; +static Tcl_ObjCmdProc EncodingProfilesObjCmd; static Tcl_ObjCmdProc EncodingSystemObjCmd; static inline int ForeachAssignments(Tcl_Interp *interp, struct ForeachState *statePtr); @@ -386,6 +387,7 @@ TclInitEncodingCmd( {"convertto", EncodingConverttoObjCmd, TclCompileBasic1To3ArgCmd, NULL, NULL, 0}, {"dirs", EncodingDirsObjCmd, TclCompileBasic0Or1ArgCmd, NULL, NULL, 1}, {"names", EncodingNamesObjCmd, TclCompileBasic0ArgCmd, NULL, NULL, 0}, + {"profiles", EncodingProfilesObjCmd, TclCompileBasic0ArgCmd, NULL, NULL, 0}, {"system", EncodingSystemObjCmd, TclCompileBasic0Or1ArgCmd, NULL, NULL, 1}, {NULL, NULL, NULL, NULL, NULL, 0} }; @@ -394,6 +396,121 @@ TclInitEncodingCmd( } /* + *------------------------------------------------------------------------ + * + * EncodingConvertParseOptions -- + * + * Common routine for parsing arguments passed to encoding convertfrom + * and encoding convertto. + * + * Results: + * TCL_OK or TCL_ERROR. + * + * Side effects: + * On success, + * - *encPtr is set to the encoding. Must be freed with Tcl_FreeEncoding + * if non-NULL + * - *dataObjPtr is set to the Tcl_Obj containing the data to encode or + * decode + * - *profilePtr is set to encoding error handling profile + * - *failVarPtr is set to -failindex option value or NULL + * On error, all of the above are uninitialized. + * + *------------------------------------------------------------------------ + */ +static int +EncodingConvertParseOptions ( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int objc, /* Number of arguments */ + Tcl_Obj *const objv[], /* Argument objects as passed to command. */ + Tcl_Encoding *encPtr, /* Where to store the encoding */ + Tcl_Obj **dataObjPtr, /* Where to store ptr to Tcl_Obj containing data */ + int *profilePtr, /* Bit mask of encoding option profile */ + Tcl_Obj **failVarPtr /* Where to store -failindex option value */ +) +{ + static const char *const options[] = {"-profile", "-failindex", NULL}; + enum convertfromOptions { PROFILE, FAILINDEX } optIndex; + Tcl_Encoding encoding; + Tcl_Obj *dataObj; + Tcl_Obj *failVarObj; +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) + int profile = TCL_ENCODING_PROFILE_TCL8; /* TODO - default for Tcl9? */ +#else + int profile = TCL_ENCODING_PROFILE_TCL8; +#endif + + /* + * Possible combinations: + * 1) data -> objc = 2 + * 2) ?options? encoding data -> objc >= 3 + * It is intentional that specifying option forces encoding to be + * specified. Less prone to user error. This should have always been + * the case even in 8.6 imho where there were no options (ie (1) + * should never have been allowed) + */ + + if (objc == 1) { +numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ + Tcl_WrongNumArgs(interp, + 1, + objv, + "?-profile profile? ?-failindex var? encoding data"); + ((Interp *)interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; + Tcl_WrongNumArgs(interp, 1, objv, "data"); + return TCL_ERROR; + } + + failVarObj = NULL; + if (objc == 2) { + encoding = Tcl_GetEncoding(interp, NULL); + dataObj = objv[1]; + } else { + int argIndex; + for (argIndex = 1; argIndex < (objc-2); ++argIndex) { + if (Tcl_GetIndexFromObj( + interp, objv[argIndex], options, "option", 0, &optIndex) + != TCL_OK) { + return TCL_ERROR; + } + if (++argIndex == (objc - 2)) { + goto numArgsError; + } + switch (optIndex) { + case PROFILE: + if (TclEncodingProfileNameToId( + interp, Tcl_GetString(objv[argIndex]), &profile) + != TCL_OK) { + return TCL_ERROR; + } +#ifdef NOTNEEDED + /* TODO - next line probably not needed as the conversion + functions already take care of mapping profile to flags */ + profile = TclEncodingExternalFlagsToInternal(profile); +#endif + break; + case FAILINDEX: + failVarObj = objv[argIndex]; + break; + } + } + /* Get encoding after opts so no need to free it on option error */ + if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) + != TCL_OK) { + return TCL_ERROR; + } + dataObj = objv[objc - 1]; + } + + *encPtr = encoding; + *dataObjPtr = dataObj; + *profilePtr = profile; + *failVarPtr = failVarObj; + + return TCL_OK; +} + +/* *---------------------------------------------------------------------- * * EncodingConvertfromObjCmd -- @@ -419,113 +536,65 @@ EncodingConvertfromObjCmd( Tcl_Encoding encoding; /* Encoding to use */ size_t length = 0; /* Length of the byte array being converted */ const char *bytesPtr; /* Pointer to the first byte of the array */ - int flags = 0; - size_t result; - Tcl_Obj *failVarObj = NULL; - /* - * Decode parameters: - * Possible combinations: - * 1) data -> objc = 2 - * 2) encoding data -> objc = 3 - * 3) -nocomplain data -> objc = 3 - * 4) -nocomplain encoding data -> objc = 4 - * 5) -strict data -> objc = 3 - * 6) -strict encoding data -> objc = 4 - * 7) -failindex val data -> objc = 4 - * 8) -failindex val encoding data -> objc = 5 - */ + int flags; + int result; + Tcl_Obj *failVarObj; + Tcl_Size errorLocation; - if (objc == 2) { - encoding = Tcl_GetEncoding(interp, NULL); - data = objv[1]; - } else if (objc > 2 && objc < 7) { - int objcUnprocessed = objc; - data = objv[objc - 1]; - bytesPtr = Tcl_GetString(objv[1]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 'n' - && !strncmp(bytesPtr, "-nocomplain", strlen(bytesPtr))) { - flags = TCL_ENCODING_NOCOMPLAIN; - objcUnprocessed--; - } else if (bytesPtr[0] == '-' && bytesPtr[1] == 's' - && !strncmp(bytesPtr, "-strict", strlen(bytesPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed--; - bytesPtr = Tcl_GetString(objv[2]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 'f' - && !strncmp(bytesPtr, "-failindex", strlen(bytesPtr))) { - /* at least two additional arguments needed */ - if (objc < 6) { - goto encConvFromError; - } - failVarObj = objv[3]; - objcUnprocessed -= 2; - } - } else if (bytesPtr[0] == '-' && bytesPtr[1] == 'f' - && !strncmp(bytesPtr, "-failindex", strlen(bytesPtr))) { - /* at least two additional arguments needed */ - if (objc < 4) { - goto encConvFromError; - } - failVarObj = objv[2]; - flags = ENCODING_FAILINDEX; - objcUnprocessed -= 2; - bytesPtr = Tcl_GetString(objv[3]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 's' - && !strncmp(bytesPtr, "-strict", strlen(bytesPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed --; - } - } - switch (objcUnprocessed) { - case 3: - if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { - return TCL_ERROR; - } - break; - case 2: - encoding = Tcl_GetEncoding(interp, NULL); - break; - default: - goto encConvFromError; - } - } else { - encConvFromError: - Tcl_WrongNumArgs(interp, 1, objv, "?-strict? ?-failindex var? ?encoding? data"); - ((Interp *) interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; - Tcl_WrongNumArgs(interp, 1, objv, "-nocomplain ?encoding? data"); + if (EncodingConvertParseOptions( + interp, objc, objv, &encoding, &data, &flags, &failVarObj) + != TCL_OK) { return TCL_ERROR; } /* - * Convert the string into a byte array in 'ds' + * Convert the string into a byte array in 'ds'. */ bytesPtr = (char *) Tcl_GetBytesFromObj(interp, data, &length); + if (bytesPtr == NULL) { return TCL_ERROR; } - result = Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, - flags, &ds); - if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { - if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { - return TCL_ERROR; - } - } else { - char buf[TCL_INTEGER_SPACE]; - sprintf(buf, "%" TCL_Z_MODIFIER "u", result); - Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected byte sequence starting at index %" - TCL_Z_MODIFIER "u: '\\x%X'", result, UCHAR(bytesPtr[result]))); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", - buf, NULL); - Tcl_DStringFree(&ds); + result = Tcl_ExternalToUtfDStringEx(interp, encoding, bytesPtr, length, flags, + &ds, failVarObj ? &errorLocation : NULL); + /* NOTE: ds must be freed beyond this point even on error */ + switch (result) { + case TCL_OK: + errorLocation = TCL_INDEX_NONE; + break; + case TCL_ERROR: + /* Error in parameters. Should not happen. interp will have error */ + Tcl_DStringFree(&ds); + return TCL_ERROR; + default: + /* + * One of the TCL_CONVERT_* errors. If we were not interested in the + * error location, interp result would already have been filled in + * and we can just return the error. Otherwise, we have to return + * what could be decoded and the returned error location. + */ + if (failVarObj == NULL) { + Tcl_DStringFree(&ds); return TCL_ERROR; } - } else if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { + break; + } + + /* + * TCL_OK or a TCL_CONVERT_* error where the caller wants back as much + * data as was converted. + */ + if (failVarObj) { + /* I hope, wide int will cover Tcl_Size data type */ + if (Tcl_ObjSetVar2(interp, + failVarObj, + NULL, + Tcl_NewWideIntObj(errorLocation), + TCL_LEAVE_ERR_MSG) == NULL) { + Tcl_DStringFree(&ds); return TCL_ERROR; } } - /* * Note that we cannot use Tcl_DStringResult here because it will * truncate the string at the first null byte. @@ -533,9 +602,7 @@ EncodingConvertfromObjCmd( Tcl_SetObjResult(interp, Tcl_DStringToObj(&ds)); - /* - * We're done with the encoding - */ + /* We're done with the encoding */ Tcl_FreeEncoding(encoding); return TCL_OK; @@ -568,80 +635,14 @@ EncodingConverttoObjCmd( Tcl_Encoding encoding; /* Encoding to use */ size_t length; /* Length of the string being converted */ const char *stringPtr; /* Pointer to the first byte of the string */ - size_t result; - int flags = 0; - Tcl_Obj *failVarObj = NULL; - - /* - * Decode parameters: - * Possible combinations: - * 1) data -> objc = 2 - * 2) encoding data -> objc = 3 - * 3) -nocomplain data -> objc = 3 - * 4) -nocomplain encoding data -> objc = 4 - * 5) -failindex val data -> objc = 4 - * 6) -failindex val encoding data -> objc = 5 - */ - - if (objc == 2) { - encoding = Tcl_GetEncoding(interp, NULL); - data = objv[1]; - } else if (objc > 2 && objc < 7) { - int objcUnprocessed = objc; - data = objv[objc - 1]; - stringPtr = Tcl_GetString(objv[1]); - if (stringPtr[0] == '-' && stringPtr[1] == 'n' - && !strncmp(stringPtr, "-nocomplain", strlen(stringPtr))) { - flags = TCL_ENCODING_NOCOMPLAIN; - objcUnprocessed--; - } else if (stringPtr[0] == '-' && stringPtr[1] == 's' - && !strncmp(stringPtr, "-strict", strlen(stringPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed--; - stringPtr = Tcl_GetString(objv[2]); - if (stringPtr[0] == '-' && stringPtr[1] == 'f' - && !strncmp(stringPtr, "-failindex", strlen(stringPtr))) { - /* at least two additional arguments needed */ - if (objc < 6) { - goto encConvToError; - } - failVarObj = objv[3]; - objcUnprocessed -= 2; - } - } else if (stringPtr[0] == '-' && stringPtr[1] == 'f' - && !strncmp(stringPtr, "-failindex", strlen(stringPtr))) { - /* at least two additional arguments needed */ - if (objc < 4) { - goto encConvToError; - } - failVarObj = objv[2]; - flags = TCL_ENCODING_STOPONERROR; - objcUnprocessed -= 2; - stringPtr = Tcl_GetString(objv[3]); - if (stringPtr[0] == '-' && stringPtr[1] == 's' - && !strncmp(stringPtr, "-strict", strlen(stringPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed --; - } - } - switch (objcUnprocessed) { - case 3: - if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { - return TCL_ERROR; - } - break; - case 2: - encoding = Tcl_GetEncoding(interp, NULL); - break; - default: - goto encConvToError; - } - } else { - encConvToError: - Tcl_WrongNumArgs(interp, 1, objv, "?-strict? ?-failindex var? ?encoding? data"); - ((Interp *) interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; - Tcl_WrongNumArgs(interp, 1, objv, "-nocomplain ?encoding? data"); + int result; + int flags; + Tcl_Obj *failVarObj; + Tcl_Size errorLocation; + if (EncodingConvertParseOptions( + interp, objc, objv, &encoding, &data, &flags, &failVarObj) + != TCL_OK) { return TCL_ERROR; } @@ -650,40 +651,53 @@ EncodingConverttoObjCmd( */ stringPtr = Tcl_GetStringFromObj(data, &length); - result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, - flags, &ds); - if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { - if (failVarObj != NULL) { - /* I hope, wide int will cover size_t data type */ - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { - return TCL_ERROR; - } - } else { - size_t pos = Tcl_NumUtfChars(stringPtr, result); - int ucs4; - char buf[TCL_INTEGER_SPACE]; - TclUtfToUCS4(&stringPtr[result], &ucs4); - sprintf(buf, "%" TCL_Z_MODIFIER "u", result); - Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected character at index %" - TCL_Z_MODIFIER "u: 'U+%06X'", pos, ucs4)); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", - buf, NULL); - Tcl_DStringFree(&ds); + result = Tcl_UtfToExternalDStringEx(interp, encoding, stringPtr, length, flags, + &ds, failVarObj ? &errorLocation : NULL); + /* NOTE: ds must be freed beyond this point even on error */ + + switch (result) { + case TCL_OK: + errorLocation = TCL_INDEX_NONE; + break; + case TCL_ERROR: + /* Error in parameters. Should not happen. interp will have error */ + Tcl_DStringFree(&ds); + return TCL_ERROR; + default: + /* + * One of the TCL_CONVERT_* errors. If we were not interested in the + * error location, interp result would already have been filled in + * and we can just return the error. Otherwise, we have to return + * what could be decoded and the returned error location. + */ + if (failVarObj == NULL) { + Tcl_DStringFree(&ds); return TCL_ERROR; } - } else if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { + break; + } + /* + * TCL_OK or a TCL_CONVERT_* error where the caller wants back as much + * data as was converted. + */ + if (failVarObj) { + /* I hope, wide int will cover Tcl_Size data type */ + if (Tcl_ObjSetVar2(interp, + failVarObj, + NULL, + Tcl_NewWideIntObj(errorLocation), + TCL_LEAVE_ERR_MSG) == NULL) { + Tcl_DStringFree(&ds); return TCL_ERROR; } } + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj((unsigned char*) Tcl_DStringValue(&ds), Tcl_DStringLength(&ds))); Tcl_DStringFree(&ds); - /* - * We're done with the encoding - */ + /* We're done with the encoding */ Tcl_FreeEncoding(encoding); return TCL_OK; @@ -768,6 +782,34 @@ EncodingNamesObjCmd( /* *----------------------------------------------------------------------------- * + * EncodingProfilesObjCmd -- + * + * This command returns a list of the available encoding profiles + * + * Results: + * Returns a standard Tcl result + * + *----------------------------------------------------------------------------- + */ + +int +EncodingProfilesObjCmd( + TCL_UNUSED(void *), + Tcl_Interp* interp, /* Tcl interpreter */ + int objc, /* Number of command line args */ + Tcl_Obj* const objv[]) /* Vector of command line args */ +{ + if (objc > 1) { + Tcl_WrongNumArgs(interp, 1, objv, NULL); + return TCL_ERROR; + } + TclGetEncodingProfiles(interp); + return TCL_OK; +} + +/* + *----------------------------------------------------------------------------- + * * EncodingSystemObjCmd -- * * This command retrieves or changes the system encoding diff --git a/generic/tclDecls.h b/generic/tclDecls.h index f219500..bdc094d 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1766,13 +1766,17 @@ EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); /* 657 */ EXTERN int Tcl_UniCharIsUnicode(int ch); /* 658 */ -EXTERN Tcl_Size Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, - Tcl_DString *dsPtr); +EXTERN int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, + Tcl_Encoding encoding, const char *src, + Tcl_Size srcLen, int flags, + Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr); /* 659 */ -EXTERN Tcl_Size Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, - Tcl_DString *dsPtr); +EXTERN int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, + Tcl_Encoding encoding, const char *src, + Tcl_Size srcLen, int flags, + Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr); /* 660 */ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, int sigNumber); @@ -2529,8 +2533,8 @@ typedef struct TclStubs { const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ - Tcl_Size (*tcl_ExternalToUtfDStringEx) (Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr); /* 658 */ - Tcl_Size (*tcl_UtfToExternalDStringEx) (Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr); /* 659 */ + int (*tcl_ExternalToUtfDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 658 */ + int (*tcl_UtfToExternalDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 659 */ int (*tcl_AsyncMarkFromSignal) (Tcl_AsyncHandler async, int sigNumber); /* 660 */ int (*tcl_ListObjGetElements) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *objcPtr, Tcl_Obj ***objvPtr); /* 661 */ int (*tcl_ListObjLength) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *lengthPtr); /* 662 */ @@ -3956,12 +3960,12 @@ extern const TclStubs *tclStubsPtr; #undef Tcl_UtfToExternalDString #define Tcl_UtfToExternalDString(encoding, src, len, ds) \ - (Tcl_UtfToExternalDStringEx((encoding), (src), (len), \ - TCL_ENCODING_NOCOMPLAIN, (ds)), Tcl_DStringValue(ds)) + (Tcl_UtfToExternalDStringEx(NULL, (encoding), (src), (len), \ + TCL_ENCODING_PROFILE_TCL8, (ds), NULL), Tcl_DStringValue(ds)) #undef Tcl_ExternalToUtfDString #define Tcl_ExternalToUtfDString(encoding, src, len, ds) \ - (Tcl_ExternalToUtfDStringEx((encoding), (src), (len), \ - TCL_ENCODING_NOCOMPLAIN, (ds)), Tcl_DStringValue(ds)) + (Tcl_ExternalToUtfDStringEx(NULL, (encoding), (src), (len), \ + TCL_ENCODING_PROFILE_TCL8, (ds), NULL), Tcl_DStringValue(ds)) #if defined(USE_TCL_STUBS) # if defined(_WIN32) && defined(_WIN64) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ce5626f..68f22b0 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -188,6 +188,32 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* + * Names of encoding profiles and corresponding integer values + */ +static struct TclEncodingProfiles { + const char *name; + int value; +} encodingProfiles[] = { + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"replace", TCL_ENCODING_PROFILE_REPLACE}, +}; +#define PROFILE_STRICT(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) + +#define PROFILE_REPLACE(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) + +#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) + +/* * The following variable is used in the sparse matrix code for a * TableEncoding to represent a page in the table that has no entries. */ @@ -230,6 +256,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field * of the internalrep. This should help the lifetime of encodings be more useful. @@ -1114,7 +1141,8 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_ExternalToUtfDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1128,34 +1156,55 @@ Tcl_ExternalToUtfDString( * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * - TCL_ENCODING_MODIFIED: enable Tcl internal conversion mapping \xC0\x80 + * to 0x00. Only valid for "utf-8" and "cesu-8". + * Any other flag bits will cause an error to be returned (for future + * compatibility) * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. - * + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner. + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -Tcl_Size +int Tcl_ExternalToUtfDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; @@ -1164,7 +1213,18 @@ Tcl_ExternalToUtfDStringEx( Tcl_Size dstLen; const char *srcStart = src; + /* DO FIRST - Must always be initialized before returning */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetResult(interp, + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1179,6 +1239,7 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } + flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= ENCODING_INPUT; @@ -1189,19 +1250,45 @@ Tcl_ExternalToUtfDStringEx( flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); - src += srcRead; - if (result != TCL_CONVERT_NOSPACE) { - Tcl_DStringSetLength(dstPtr, soFar); - return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart); - } - flags &= ~TCL_ENCODING_START; - srcLen -= srcRead; - if (Tcl_DStringLength(dstPtr) == 0) { - Tcl_DStringSetLength(dstPtr, dstLen); - } - Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); - dst = Tcl_DStringValue(dstPtr) + soFar; - dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; + src += srcRead; + if (result != TCL_CONVERT_NOSPACE) { + Tcl_Size nBytesProcessed = (src - srcStart); + + Tcl_DStringSetLength(dstPtr, soFar); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } + else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + char buf[TCL_INTEGER_SPACE]; + sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("unexpected byte sequence starting at index %" + TCL_Z_MODIFIER "u: '\\x%X'", + nBytesProcessed, + UCHAR(srcStart[nBytesProcessed]))); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL); + } + } + return result; + } + + /* Expand space and continue */ + flags &= ~TCL_ENCODING_START; + srcLen -= srcRead; + if (Tcl_DStringLength(dstPtr) == 0) { + Tcl_DStringSetLength(dstPtr, dstLen); + } + Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); + dst = Tcl_DStringValue(dstPtr) + soFar; + dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; } } @@ -1351,7 +1438,8 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_UtfToExternalDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_DEFAULT, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1364,36 +1452,53 @@ Tcl_UtfToExternalDString( * Convert a source buffer from UTF-8 to the specified encoding. * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in the - * target encoding. - * Possible flags values: - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * - TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 instead + * of 0x00. Only valid for "utf-8" and "cesu-8". * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfToExternalDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. */ Tcl_Size srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; @@ -1402,7 +1507,18 @@ Tcl_UtfToExternalDStringEx( const char *srcStart = src; Tcl_Size dstLen; + /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetResult(interp, + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1416,20 +1532,49 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen == TCL_INDEX_NONE) { srcLen = strlen(src); } + + flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, - &srcRead, &dstWrote, &dstChars); + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); src += srcRead; if (result != TCL_CONVERT_NOSPACE) { + Tcl_Size nBytesProcessed = (src - srcStart); int i = soFar + encodingPtr->nullSize - 1; while (i >= soFar) { Tcl_DStringSetLength(dstPtr, i--); } - return (result == TCL_OK) ? TCL_INDEX_NONE : (Tcl_Size)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } + else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4); + sprintf(buf, "%" TCL_Z_MODIFIER "u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "unexpected character at index %" TCL_Z_MODIFIER + "u: 'U+%06X'", + pos, + ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; @@ -2257,14 +2402,12 @@ BinaryProc( *------------------------------------------------------------------------- */ -#define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN)) - static int UtfToUtfProc( void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ @@ -2286,6 +2429,7 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int profile; result = TCL_OK; @@ -2303,7 +2447,9 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + profile = TCL_ENCODING_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2324,25 +2470,34 @@ UtfToUtfProc( */ *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) && (!(flags & ENCODING_INPUT) - || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - || (flags & ENCODING_FAILINDEX))) { - /* - * If in input mode, and -strict or -failindex is specified: This is an error. - */ - if ((STOPONERROR) && (flags & ENCODING_INPUT)) { - result = TCL_CONVERT_SYNTAX; - break; + } + else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) && + (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || + PROFILE_REPLACE(profile))) { + /* Special sequence \xC0\x80 */ + if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } + } else { + /* + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. + */ + *dst++ = 0; + src += 2; } + } + else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* - * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. - */ - *dst++ = 0; - src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { - /* + * Incomplete byte sequence. * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves @@ -2350,32 +2505,45 @@ UtfToUtfProc( */ if (flags & ENCODING_INPUT) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { - result = TCL_CONVERT_MULTIBYTE; + /* Incomplete bytes for modified UTF-8 target */ + if (PROFILE_STRICT(profile)) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; break; } - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) { - result = TCL_CONVERT_SYNTAX; - break; - } - } - char chbuf[2]; - chbuf[0] = UCHAR(*src++); chbuf[1] = 0; - TclUtfToUCS4(chbuf, &ch); + } + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + ++src; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + char chbuf[2]; + chbuf[0] = UCHAR(*src++); chbuf[1] = 0; + TclUtfToUCS4(chbuf, &ch); + } dst += Tcl_UniCharToUtf(ch, dst); - } else { + } + else { + int isInvalid = 0; size_t len = TclUtfToUCS4(src, &ch); if (flags & ENCODING_INPUT) { - if ((len < 2) && (ch != 0) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) { - goto utf8Syntax; - } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) { - utf8Syntax: - result = TCL_CONVERT_SYNTAX; - break; + if ((len < 2) && (ch != 0)) { + isInvalid = 1; + } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) { + isInvalid = 1; + } + if (isInvalid) { + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_SYNTAX; + break; + } + else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } } } + const char *saveSrc = src; src += len; if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { @@ -2399,34 +2567,42 @@ UtfToUtfProc( /* * A surrogate character is detected, handle especially. */ - - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) { + if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } - int low = ch; - len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - - if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; + if (PROFILE_REPLACE(profile)) { + /* TODO - is this right for cesu8 or should we fall through below? */ + ch = UNICODE_REPLACE_CHAR; + } + else { + int low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + goto cesu8; } - goto cesu8; + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; #endif - } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch & ~0x7FF) == 0xD800))) { + } else if (PROFILE_STRICT(profile) && + (!(flags & ENCODING_INPUT)) && + SURROGATE(ch)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && (flags & ENCODING_INPUT) && ((ch & ~0x7FF) == 0xD800)) { + } else if (PROFILE_STRICT(profile) && + (flags & ENCODING_INPUT) && + SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2494,8 +2670,8 @@ Utf32ToUtfProc( /* * Check alignment with utf-32 (4 == sizeof(UTF-32)) */ - if (bytesLeft != 0) { + /* We have a truncated code unit */ result = TCL_CONVERT_MULTIBYTE; srcLen -= bytesLeft; } @@ -2517,17 +2693,14 @@ Utf32ToUtfProc( } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF) { - if (STOPONERROR) { + + if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } - ch = 0xFFFD; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && ((ch & ~0x7FF) == 0xD800)) { - if (STOPONERROR) { - result = TCL_CONVERT_SYNTAX; - break; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; } } @@ -2541,25 +2714,31 @@ Utf32ToUtfProc( } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned int); + src += 4; } + + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a single byte left-over at the end */ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - /* destination is not full, so we really are at the end now */ - if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) { - result = TCL_CONVERT_SYNTAX; - } else { - result = TCL_OK; - dst += Tcl_UniCharToUtf(0xFFFD, dst); - numChars++; - src += bytesLeft; - } - } + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src += bytesLeft; /* Go past truncated code unit */ + } + } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2636,11 +2815,14 @@ UtfToUtf32Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2772,22 +2954,27 @@ Utf16ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is not "strict", stick FFFD in its place. + */ if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { - /* We have a single byte left-over at the end */ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - /* destination is not full, so we really are at the end now */ - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { - result = TCL_CONVERT_SYNTAX; - } else { - result = TCL_OK; - dst += Tcl_UniCharToUtf(0xFFFD, dst); - numChars++; - src++; - } - } + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src++; /* Go past truncated code unit */ + } + } } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2864,11 +3051,14 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2971,25 +3161,25 @@ UtfToUcs2Proc( #if TCL_UTF_MAX < 4 len = TclUtfToUniChar(src, &ch); if ((ch >= 0xD800) && (len < 3)) { - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - break; + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; } src += len; src += TclUtfToUniChar(src, &ch); - ch = 0xFFFD; + ch = UNICODE_REPLACE_CHAR; } #else len = TclUtfToUniChar(src, &ch); if (ch > 0xFFFF) { - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - break; + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_UNKNOWN; + break; } - ch = 0xFFFD; + ch = UNICODE_REPLACE_CHAR; } #endif - if (STOPONERROR && ((ch & ~0x7FF) == 0xD800)) { + if (PROFILE_STRICT(flags) && ((ch & ~0x7FF) == 0xD800)) { result = TCL_CONVERT_SYNTAX; break; } @@ -3087,24 +3277,35 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { + /* + * TODO - this is broken. For consistency with other + * decoders, an error should be raised only if strict. + * However, doing that check cause a whole bunch of test + * failures. Need to verify if those tests are in fact + * correct. + */ src--; result = TCL_CONVERT_MULTIBYTE; break; } + ch = toUnicode[byte][*((unsigned char *)src)]; ch = toUnicode[byte][*((unsigned char *) src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if ((flags & ENCODING_FAILINDEX) - || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar)byte; + } } /* @@ -3213,11 +3414,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3401,7 +3602,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3414,7 +3615,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3628,9 +3829,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!STOPONERROR) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3803,7 +4005,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -4097,6 +4299,158 @@ InitializeEncodingSearchPath( } /* + *------------------------------------------------------------------------ + * + * TclEncodingProfileParseName -- + * + * Maps an encoding profile name to its integer equivalent. + * + * Results: + * TCL_OK on success or TCL_ERROR on failure. + * + * Side effects: + * Returns the profile enum value in *profilePtr + * + *------------------------------------------------------------------------ + */ +int +TclEncodingProfileNameToId( + Tcl_Interp *interp, /* For error messages. May be NULL */ + const char *profileName, /* Name of profile */ + int *profilePtr) /* Output */ +{ + size_t i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (!strcmp(profileName, encodingProfiles[i].name)) { + *profilePtr = encodingProfiles[i].value; + return TCL_OK; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "bad profile \"%s\". Must be \"tcl8\" or \"strict\".", + profileName)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); + } + return TCL_ERROR; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingProfileValueToName -- + * + * Maps an encoding profile value to its name. + * + * Results: + * Pointer to the name or NULL on failure. Caller must not make + * not modify the string and must make a copy to hold on to it. + * + * Side effects: + * None. + *------------------------------------------------------------------------ + */ +const char * +TclEncodingProfileIdToName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int profileValue) /* Profile #define value */ +{ + size_t i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (profileValue == encodingProfiles[i].value) { + return encodingProfiles[i].name; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "Internal error. Bad profile id \"%d\".", + profileValue)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILEID", NULL); + } + return NULL; +} + +/* + *------------------------------------------------------------------------ + * + * TclEncodingExternalFlagsToInternal -- + * + * Maps the flags supported in the encoding C API's to internal flags. + * + * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is + * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile + * specified. + * + * If no profile or an invalid profile is specified, it is set to + * the default. + * + * Results: + * Internal encoding flag mask. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------ + */ +int TclEncodingExternalFlagsToInternal(int flags) +{ + if (flags & TCL_ENCODING_STOPONERROR) { + TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); + } + else { + int profile = TCL_ENCODING_PROFILE_GET(flags); + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: + break; + case 0: /* Unspecified by caller */ + default: + TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); + break; + } + } + return flags; +} + +/* + *------------------------------------------------------------------------ + * + * TclGetEncodingProfiles -- + * + * Get the list of supported encoding profiles. + * + * Results: + * None. + * + * Side effects: + * The list of profile names is stored in the interpreter result. + * + *------------------------------------------------------------------------ + */ +void +TclGetEncodingProfiles(Tcl_Interp *interp) +{ + int i, n; + Tcl_Obj *objPtr; + n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + objPtr = Tcl_NewListObj(n, NULL); + for (i = 0; i < n; ++i) { + Tcl_ListObjAppendElement( + interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, -1)); + } + Tcl_SetObjResult(interp, objPtr); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 diff --git a/generic/tclIO.c b/generic/tclIO.c index 26d0011..6d6a935 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -1675,8 +1675,12 @@ Tcl_CreateChannel( } statePtr->inputEncodingState = NULL; statePtr->inputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, + TCL_ENCODING_PROFILE_DEFAULT); statePtr->outputEncodingState = NULL; statePtr->outputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, + TCL_ENCODING_PROFILE_DEFAULT); /* * Set the channel up initially in AUTO input translation mode to accept @@ -4343,21 +4347,6 @@ Write( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->outputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; - } else { - statePtr->outputEncodingFlags &= ~TCL_ENCODING_NOCOMPLAIN; - } - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->outputEncodingFlags |= TCL_ENCODING_STRICT; - } else { - statePtr->outputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * Write the terminated escape sequence even if srcLen is 0. */ @@ -4681,21 +4670,6 @@ Tcl_GetsObj( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_NOCOMPLAIN; - } - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * Object used by FilterInputBytes to keep track of how much data has been * consumed from the channel buffers. */ @@ -5458,21 +5432,6 @@ FilterInputBytes( } gsPtr->state = statePtr->inputEncodingState; - /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_NOCOMPLAIN; - } - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - result = Tcl_ExternalToUtf(NULL, gsPtr->encoding, raw, rawLen, statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, &statePtr->inputEncodingState, dst, spaceLeft, &gsPtr->rawRead, @@ -6259,21 +6218,6 @@ ReadChars( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_NOCOMPLAIN; - } - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * This routine is burdened with satisfying several constraints. It cannot * append more than 'charsToRead` chars onto objPtr. This is measured * after encoding and translation transformations are completed. There is @@ -7810,7 +7754,7 @@ Tcl_BadChannelOption( { if (interp != NULL) { const char *genericopt = - "blocking buffering buffersize encoding eofchar nocomplainencoding strictencoding translation"; + "blocking buffering buffersize encoding encodingprofile eofchar translation"; const char **argv; size_t argc, i; Tcl_DString ds; @@ -7951,7 +7895,7 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(2, "-encoding")) { + if (len == 0 || HaveOpt(8, "-encoding")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-encoding"); } @@ -7965,39 +7909,36 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(2, "-eofchar")) { - char buf[4] = ""; + if (len == 0 || HaveOpt(9, "-encodingprofile")) { + int profile; + const char *profileName; if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-eofchar"); + Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); } - if ((flags & TCL_READABLE) && (statePtr->inEofChar != 0)) { - sprintf(buf, "%c", statePtr->inEofChar); + /* Note currently input and output profiles are same */ + profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); + profileName = TclEncodingProfileIdToName(interp, profile); + if (profileName == NULL) { + return TCL_ERROR; } + Tcl_DStringAppendElement(dsPtr, profileName); if (len > 0) { - Tcl_DStringAppend(dsPtr, buf, TCL_INDEX_NONE); return TCL_OK; } - Tcl_DStringAppendElement(dsPtr, buf); } - if (len == 0 || HaveOpt(1, "-nocomplainencoding")) { + if (len == 0 || HaveOpt(2, "-eofchar")) { + char buf[4] = ""; if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-nocomplainencoding"); - } - Tcl_DStringAppendElement(dsPtr, - (flags & CHANNEL_ENCODING_NOCOMPLAIN) ? "1" : "0"); - if (len > 0) { - return TCL_OK; + Tcl_DStringAppendElement(dsPtr, "-eofchar"); } - } - if (len == 0 || HaveOpt(1, "-strictencoding")) { - if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-strictencoding"); + if ((flags & TCL_READABLE) && (statePtr->inEofChar != 0)) { + sprintf(buf, "%c", statePtr->inEofChar); } - Tcl_DStringAppendElement(dsPtr, - (flags & CHANNEL_ENCODING_STRICT) ? "1" : "0"); if (len > 0) { + Tcl_DStringAppend(dsPtr, buf, TCL_INDEX_NONE); return TCL_OK; } + Tcl_DStringAppendElement(dsPtr, buf); } if (len == 0 || HaveOpt(1, "-translation")) { if (len == 0) { @@ -8180,6 +8121,7 @@ Tcl_SetChannelOption( return TCL_OK; } else if (HaveOpt(2, "-encoding")) { Tcl_Encoding encoding; + int profile; if ((newValue[0] == '\0') || (strcmp(newValue, "binary") == 0)) { encoding = NULL; @@ -8204,9 +8146,12 @@ Tcl_SetChannelOption( Tcl_FreeEncoding(statePtr->encoding); statePtr->encoding = encoding; statePtr->inputEncodingState = NULL; + profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); statePtr->inputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); statePtr->outputEncodingState = NULL; statePtr->outputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); /* Same as input */ ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); UpdateInterest(chanPtr); return TCL_OK; @@ -8244,30 +8189,13 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_EOF|CHANNEL_STICKY_EOF|CHANNEL_BLOCKED); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; return TCL_OK; - } else if (HaveOpt(1, "-nocomplainencoding")) { - int newMode; - - if (Tcl_GetBoolean(interp, newValue, &newMode) == TCL_ERROR) { - return TCL_ERROR; - } - if (newMode) { - SetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - } else { - ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - } - ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); - return TCL_OK; - } else if (HaveOpt(1, "-strictencoding")) { - int newMode; - - if (Tcl_GetBoolean(interp, newValue, &newMode) == TCL_ERROR) { + } else if (HaveOpt(1, "-encodingprofile")) { + int profile; + if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { return TCL_ERROR; } - if (newMode) { - SetFlag(statePtr, CHANNEL_ENCODING_STRICT); - } else { - ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); - } + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); return TCL_OK; } else if (HaveOpt(1, "-translation")) { @@ -9344,12 +9272,17 @@ TclCopyChannel( * of the bytes themselves. */ + /* + * TODO - should really only allow lossless profiles. Below reflects + * Tcl 8.7 alphas prior to encoding profiles + */ + moveBytes = inStatePtr->inEofChar == '\0' /* No eofChar to stop input */ && inStatePtr->inputTranslation == TCL_TRANSLATE_LF && outStatePtr->outputTranslation == TCL_TRANSLATE_LF && inStatePtr->encoding == outStatePtr->encoding - && (inStatePtr->flags & TCL_ENCODING_STRICT) != TCL_ENCODING_STRICT - && outStatePtr->flags & TCL_ENCODING_NOCOMPLAIN; + && TCL_ENCODING_PROFILE_GET(inStatePtr->flags) != TCL_ENCODING_PROFILE_STRICT + && TCL_ENCODING_PROFILE_GET(outStatePtr->flags) == TCL_ENCODING_PROFILE_TCL8; /* * Allocate a new CopyState to maintain info about the current copy in @@ -9677,8 +9610,8 @@ CopyData( inBinary = (inStatePtr->encoding == NULL); outBinary = (outStatePtr->encoding == NULL); sameEncoding = inStatePtr->encoding == outStatePtr->encoding - && (inStatePtr->flags & TCL_ENCODING_STRICT) != TCL_ENCODING_STRICT - && outStatePtr->flags & TCL_ENCODING_NOCOMPLAIN; + && TCL_ENCODING_PROFILE_GET(inStatePtr->flags) != TCL_ENCODING_PROFILE_STRICT + && TCL_ENCODING_PROFILE_GET(outStatePtr->flags) == TCL_ENCODING_PROFILE_TCL8; if (!(inBinary || sameEncoding)) { TclNewObj(bufObj); diff --git a/generic/tclIO.h b/generic/tclIO.h index 62cf6e8..a050010 100644 --- a/generic/tclIO.h +++ b/generic/tclIO.h @@ -277,16 +277,11 @@ typedef struct ChannelState { * encountered an encoding error */ #define CHANNEL_RAW_MODE (1<<16) /* When set, notes that the Raw API is * being used. */ -#define CHANNEL_ENCODING_NOCOMPLAIN (1<<17) /* set if option - * -nocomplainencoding is set to 1 */ -#define CHANNEL_ENCODING_STRICT (1<<18) /* set if option - * -strictencoding is set to 1 */ #define CHANNEL_INCLOSE (1<<19) /* Channel is currently being closed. * Its structures are still live and * usable, but it may not be closed * again from within the close * handler. */ -#define ENCODING_FAILINDEX (1<<20) /* Internal flag, fail on Invalid bytes only */ #define CHANNEL_CLOSEDWRITE (1<<21) /* Channel write side has been closed. * No further Tcl-level write IO on * the channel is allowed. */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 827fd6f..9a9c0ae 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2875,7 +2875,19 @@ MODULE_SCOPE int tclFindExecutableSearchDone; MODULE_SCOPE char *tclMemDumpFileName; MODULE_SCOPE TclPlatformType tclPlatform; +/* + * Declarations related to internal encoding functions. + */ + MODULE_SCOPE Tcl_Encoding tclIdentityEncoding; +MODULE_SCOPE int +TclEncodingProfileNameToId(Tcl_Interp *interp, + const char *profileName, + int *profilePtr); +MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp, + int profileId); +MODULE_SCOPE int TclEncodingExternalFlagsToInternal(int flags); +MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp); /* * TIP #233 (Virtualized Time) @@ -4748,6 +4760,8 @@ MODULE_SCOPE Tcl_LibraryInitProc TclThread_Init; MODULE_SCOPE Tcl_LibraryInitProc Procbodytest_Init; MODULE_SCOPE Tcl_LibraryInitProc Procbodytest_SafeInit; + + /* *---------------------------------------------------------------- * Macro used by the Tcl core to check whether a pattern has any characters diff --git a/generic/tclUtil.c b/generic/tclUtil.c index e96a564..3abd615 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -4026,10 +4026,11 @@ TclGetProcessGlobalValue( Tcl_MutexLock(&pgvPtr->mutex); epoch = ++pgvPtr->epoch; - Tcl_UtfToExternalDStringEx(pgvPtr->encoding, pgvPtr->value, - pgvPtr->numBytes, TCL_ENCODING_NOCOMPLAIN, &native); - Tcl_ExternalToUtfDStringEx(current, Tcl_DStringValue(&native), - Tcl_DStringLength(&native), TCL_ENCODING_NOCOMPLAIN, &newValue); + Tcl_UtfToExternalDStringEx(NULL, pgvPtr->encoding, pgvPtr->value, + pgvPtr->numBytes, TCL_ENCODING_PROFILE_TCL8, &native, NULL); + Tcl_ExternalToUtfDStringEx(NULL, current, Tcl_DStringValue(&native), + Tcl_DStringLength(&native), TCL_ENCODING_PROFILE_TCL8, + &newValue, NULL); Tcl_DStringFree(&native); Tcl_Free(pgvPtr->value); pgvPtr->value = (char *)Tcl_Alloc(Tcl_DStringLength(&newValue) + 1); diff --git a/generic/tclZlib.c b/generic/tclZlib.c index 5a6dbc4..dc7c3f3 100644 --- a/generic/tclZlib.c +++ b/generic/tclZlib.c @@ -547,8 +547,8 @@ ExtractHeader( } } - Tcl_ExternalToUtfDStringEx(latin1enc, (char *) headerPtr->comment, -1, - TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_ExternalToUtfDStringEx(NULL, latin1enc, (char *) headerPtr->comment, -1, + TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); SetValue(dictObj, "comment", Tcl_DStringToObj(&tmp)); } SetValue(dictObj, "crc", Tcl_NewBooleanObj(headerPtr->hcrc)); @@ -564,8 +564,8 @@ ExtractHeader( } } - Tcl_ExternalToUtfDStringEx(latin1enc, (char *) headerPtr->name, -1, - TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_ExternalToUtfDStringEx(NULL, latin1enc, (char *) headerPtr->name, -1, + TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); SetValue(dictObj, "filename", Tcl_DStringToObj(&tmp)); } if (headerPtr->os != 255) { diff --git a/unix/tclUnixChan.c b/unix/tclUnixChan.c index c41cdd9..b81676e 100644 --- a/unix/tclUnixChan.c +++ b/unix/tclUnixChan.c @@ -1028,11 +1028,11 @@ TtyGetOptionProc( tcgetattr(fsPtr->fileState.fd, &iostate); Tcl_DStringInit(&ds); - Tcl_ExternalToUtfDStringEx(NULL, (char *) &iostate.c_cc[VSTART], 1, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTART], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds)); TclDStringClear(&ds); - Tcl_ExternalToUtfDStringEx(NULL, (char *) &iostate.c_cc[VSTOP], 1, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTOP], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds)); Tcl_DStringFree(&ds); } diff --git a/unix/tclUnixFCmd.c b/unix/tclUnixFCmd.c index b205061..7753cec 100644 --- a/unix/tclUnixFCmd.c +++ b/unix/tclUnixFCmd.c @@ -762,16 +762,16 @@ TclpObjCopyDirectory( Tcl_Obj *transPtr; transPtr = Tcl_FSGetTranslatedPath(NULL,srcPathPtr); - Tcl_UtfToExternalDStringEx(NULL, + Tcl_UtfToExternalDStringEx(NULL, NULL, (transPtr != NULL ? TclGetString(transPtr) : NULL), - -1, TCL_ENCODING_NOCOMPLAIN, &srcString); + -1, TCL_ENCODING_PROFILE_TCL8, &srcString, NULL); if (transPtr != NULL) { Tcl_DecrRefCount(transPtr); } transPtr = Tcl_FSGetTranslatedPath(NULL,destPathPtr); - Tcl_UtfToExternalDStringEx(NULL, + Tcl_UtfToExternalDStringEx(NULL, NULL, (transPtr != NULL ? TclGetString(transPtr) : NULL), - -1, TCL_ENCODING_NOCOMPLAIN, &dstString); + -1, TCL_ENCODING_PROFILE_TCL8, &dstString, NULL); if (transPtr != NULL) { Tcl_DecrRefCount(transPtr); } @@ -826,9 +826,9 @@ TclpObjRemoveDirectory( int ret; Tcl_Obj *transPtr = Tcl_FSGetTranslatedPath(NULL, pathPtr); - Tcl_UtfToExternalDStringEx(NULL, + Tcl_UtfToExternalDStringEx(NULL, NULL, (transPtr != NULL ? TclGetString(transPtr) : NULL), - -1, TCL_ENCODING_NOCOMPLAIN, &pathString); + -1, TCL_ENCODING_PROFILE_TCL8, &pathString, NULL); if (transPtr != NULL) { Tcl_DecrRefCount(transPtr); } @@ -886,7 +886,7 @@ DoRemoveDirectory( result = TCL_OK; if ((errno != EEXIST) || (recursive == 0)) { if (errorPtr != NULL) { - Tcl_ExternalToUtfDStringEx(NULL, path, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, errorPtr); + Tcl_ExternalToUtfDStringEx(NULL, NULL, path, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, errorPtr, NULL); } result = TCL_ERROR; } @@ -1135,7 +1135,7 @@ TraverseUnixTree( end: if (errfile != NULL) { if (errorPtr != NULL) { - Tcl_ExternalToUtfDStringEx(NULL, errfile, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, errorPtr); + Tcl_ExternalToUtfDStringEx(NULL, NULL, errfile, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, errorPtr, NULL); } result = TCL_ERROR; } @@ -1205,8 +1205,8 @@ TraversalCopy( */ if (errorPtr != NULL) { - Tcl_ExternalToUtfDStringEx(NULL, Tcl_DStringValue(dstPtr), - Tcl_DStringLength(dstPtr), TCL_ENCODING_NOCOMPLAIN, errorPtr); + Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(dstPtr), + Tcl_DStringLength(dstPtr), TCL_ENCODING_PROFILE_TCL8, errorPtr, NULL); } return TCL_ERROR; } @@ -1256,8 +1256,8 @@ TraversalDelete( break; } if (errorPtr != NULL) { - Tcl_ExternalToUtfDStringEx(NULL, Tcl_DStringValue(srcPtr), - Tcl_DStringLength(srcPtr), TCL_ENCODING_NOCOMPLAIN, errorPtr); + Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(srcPtr), + Tcl_DStringLength(srcPtr), TCL_ENCODING_PROFILE_TCL8, errorPtr, NULL); } return TCL_ERROR; } @@ -1424,7 +1424,7 @@ GetOwnerAttribute( } else { Tcl_DString ds; - Tcl_ExternalToUtfDStringEx(NULL, pwPtr->pw_name, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, pwPtr->pw_name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); *attributePtrPtr = Tcl_DStringToObj(&ds); } return TCL_OK; @@ -2086,7 +2086,7 @@ TclpObjNormalizePath( */ Tcl_DStringFree(&ds); - Tcl_ExternalToUtfDStringEx(NULL, normPath, newNormLen, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, normPath, newNormLen, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); if (path[nextCheckpoint] != '\0') { /* @@ -2179,7 +2179,7 @@ TclUnixOpenTemporaryFile( if (dirObj) { string = Tcl_GetStringFromObj(dirObj, &length); - Tcl_UtfToExternalDStringEx(NULL, string, length, TCL_ENCODING_NOCOMPLAIN, &templ); + Tcl_UtfToExternalDStringEx(NULL, NULL, string, length, TCL_ENCODING_PROFILE_TCL8, &templ, NULL); } else { Tcl_DStringInit(&templ); Tcl_DStringAppend(&templ, DefaultTempDir(), TCL_INDEX_NONE); /* INTL: native */ @@ -2189,7 +2189,7 @@ TclUnixOpenTemporaryFile( if (basenameObj) { string = Tcl_GetStringFromObj(basenameObj, &length); - Tcl_UtfToExternalDStringEx(NULL, string, length, TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_UtfToExternalDStringEx(NULL, NULL, string, length, TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); TclDStringAppendDString(&templ, &tmp); Tcl_DStringFree(&tmp); } else { @@ -2201,7 +2201,7 @@ TclUnixOpenTemporaryFile( #ifdef HAVE_MKSTEMPS if (extensionObj) { string = Tcl_GetStringFromObj(extensionObj, &length); - Tcl_UtfToExternalDStringEx(NULL, string, length, TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_UtfToExternalDStringEx(NULL, NULL, string, length, TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); TclDStringAppendDString(&templ, &tmp); fd = mkstemps(Tcl_DStringValue(&templ), Tcl_DStringLength(&tmp)); Tcl_DStringFree(&tmp); @@ -2217,8 +2217,8 @@ TclUnixOpenTemporaryFile( } if (resultingNameObj) { - Tcl_ExternalToUtfDStringEx(NULL, Tcl_DStringValue(&templ), - Tcl_DStringLength(&templ), TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&templ), + Tcl_DStringLength(&templ), TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); Tcl_SetStringObj(resultingNameObj, Tcl_DStringValue(&tmp), Tcl_DStringLength(&tmp)); Tcl_DStringFree(&tmp); @@ -2304,7 +2304,7 @@ TclpCreateTemporaryDirectory( if (dirObj) { string = TclGetString(dirObj); - Tcl_UtfToExternalDStringEx(NULL, string, dirObj->length, TCL_ENCODING_NOCOMPLAIN, &templ); + Tcl_UtfToExternalDStringEx(NULL, NULL, string, dirObj->length, TCL_ENCODING_PROFILE_TCL8, &templ, NULL); } else { Tcl_DStringInit(&templ); Tcl_DStringAppend(&templ, DefaultTempDir(), TCL_INDEX_NONE); /* INTL: native */ @@ -2317,7 +2317,7 @@ TclpCreateTemporaryDirectory( if (basenameObj) { string = TclGetString(basenameObj); if (basenameObj->length) { - Tcl_UtfToExternalDStringEx(NULL, string, basenameObj->length, TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_UtfToExternalDStringEx(NULL, NULL, string, basenameObj->length, TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); TclDStringAppendDString(&templ, &tmp); Tcl_DStringFree(&tmp); } else { @@ -2342,8 +2342,8 @@ TclpCreateTemporaryDirectory( * The template has been updated. Tell the caller what it was. */ - Tcl_ExternalToUtfDStringEx(NULL, Tcl_DStringValue(&templ), - Tcl_DStringLength(&templ), TCL_ENCODING_NOCOMPLAIN, &tmp); + Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&templ), + Tcl_DStringLength(&templ), TCL_ENCODING_PROFILE_TCL8, &tmp, NULL); Tcl_DStringFree(&templ); return Tcl_DStringToObj(&tmp); } diff --git a/unix/tclUnixFile.c b/unix/tclUnixFile.c index 830ed6f..fc297cb 100644 --- a/unix/tclUnixFile.c +++ b/unix/tclUnixFile.c @@ -153,7 +153,7 @@ TclpFindExecutable( #endif { encoding = Tcl_GetEncoding(NULL, NULL); - Tcl_ExternalToUtfDStringEx(encoding, name, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, &utfName); + Tcl_ExternalToUtfDStringEx(NULL, encoding, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &utfName, NULL); TclSetObjNameOfExecutable( Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding); Tcl_DStringFree(&utfName); @@ -179,8 +179,8 @@ TclpFindExecutable( Tcl_DStringAppend(&nameString, name, TCL_INDEX_NONE); Tcl_DStringFree(&buffer); - Tcl_UtfToExternalDStringEx(NULL, Tcl_DStringValue(&cwd), - Tcl_DStringLength(&cwd), TCL_ENCODING_NOCOMPLAIN, &buffer); + Tcl_UtfToExternalDStringEx(NULL, NULL, Tcl_DStringValue(&cwd), + Tcl_DStringLength(&cwd), TCL_ENCODING_PROFILE_TCL8, &buffer, NULL); if (Tcl_DStringValue(&cwd)[Tcl_DStringLength(&cwd) -1] != '/') { TclDStringAppendLiteral(&buffer, "/"); } @@ -189,8 +189,8 @@ TclpFindExecutable( Tcl_DStringFree(&nameString); encoding = Tcl_GetEncoding(NULL, NULL); - Tcl_ExternalToUtfDStringEx(encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE, - TCL_ENCODING_NOCOMPLAIN, &utfName); + Tcl_ExternalToUtfDStringEx(NULL, encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE, + TCL_ENCODING_PROFILE_TCL8, &utfName, NULL); TclSetObjNameOfExecutable( Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding); Tcl_DStringFree(&utfName); @@ -825,7 +825,7 @@ TclpReadlink( return NULL; } - Tcl_ExternalToUtfDStringEx(NULL, link, length, TCL_ENCODING_NOCOMPLAIN, linkPtr); + Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, TCL_ENCODING_PROFILE_TCL8, linkPtr, NULL); return Tcl_DStringValue(linkPtr); #else return NULL; @@ -994,7 +994,7 @@ TclpObjLink( return NULL; } - Tcl_ExternalToUtfDStringEx(NULL, link, length, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); linkPtr = Tcl_DStringToObj(&ds); Tcl_IncrRefCount(linkPtr); return linkPtr; @@ -1059,7 +1059,7 @@ TclpNativeToNormalized( { Tcl_DString ds; - Tcl_ExternalToUtfDStringEx(NULL, (const char *) clientData, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, (const char *) clientData, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); return Tcl_DStringToObj(&ds); } @@ -1113,7 +1113,7 @@ TclNativeCreateNativeRep( } str = Tcl_GetStringFromObj(validPathPtr, &len); - Tcl_UtfToExternalDStringEx(NULL, str, len, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_UtfToExternalDStringEx(NULL, NULL, str, len, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); len = Tcl_DStringLength(&ds) + sizeof(char); if (strlen(Tcl_DStringValue(&ds)) < len - sizeof(char)) { /* See bug [3118489]: NUL in filenames */ diff --git a/unix/tclUnixInit.c b/unix/tclUnixInit.c index 8f7a737..71b059a 100644 --- a/unix/tclUnixInit.c +++ b/unix/tclUnixInit.c @@ -473,7 +473,7 @@ TclpInitLibraryPath( */ str = getenv("TCL_LIBRARY"); /* INTL: Native. */ - Tcl_ExternalToUtfDStringEx(NULL, str, TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, &buffer); + Tcl_ExternalToUtfDStringEx(NULL, NULL, str, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &buffer, NULL); str = Tcl_DStringValue(&buffer); if ((str != NULL) && (str[0] != '\0')) { diff --git a/win/tclWinSock.c b/win/tclWinSock.c index e5c7ee3..4eeeeec 100644 --- a/win/tclWinSock.c +++ b/win/tclWinSock.c @@ -373,8 +373,8 @@ InitializeHostName( Tcl_DStringSetLength(&inDs, 256); if (gethostname(Tcl_DStringValue(&inDs), Tcl_DStringLength(&inDs)) == 0) { - Tcl_ExternalToUtfDStringEx(NULL, Tcl_DStringValue(&inDs), - TCL_INDEX_NONE, TCL_ENCODING_NOCOMPLAIN, &ds); + Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&inDs), + TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL); } Tcl_DStringFree(&inDs); } |
