diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-17 09:51:38 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-17 09:51:38 (GMT) |
commit | cf4140cc14a1800d2a74e73545fd86c49f3a1e73 (patch) | |
tree | e90b82a0260d5ead97aeff9dc4e483bccb3f92de | |
parent | 2041f4450bf1aca29472f00ab21b13448bcde2d4 (diff) | |
parent | 6ec7e10a1634a0a9c10ed2cf90072ba723d701ce (diff) | |
download | tcl-cf4140cc14a1800d2a74e73545fd86c49f3a1e73.zip tcl-cf4140cc14a1800d2a74e73545fd86c49f3a1e73.tar.gz tcl-cf4140cc14a1800d2a74e73545fd86c49f3a1e73.tar.bz2 |
TIP #601: Make "encoding convertto/convertfrom" throw exceptions
-rw-r--r-- | doc/Encoding.3 | 21 | ||||
-rw-r--r-- | generic/tcl.decls | 8 | ||||
-rw-r--r-- | generic/tcl.h | 18 | ||||
-rw-r--r-- | generic/tclCmdAH.c | 103 | ||||
-rw-r--r-- | generic/tclDecls.h | 20 | ||||
-rw-r--r-- | generic/tclEncoding.c | 136 | ||||
-rw-r--r-- | generic/tclStubInit.c | 4 | ||||
-rw-r--r-- | tests/chanio.test | 6 | ||||
-rw-r--r-- | tests/cmdAH.test | 4 | ||||
-rw-r--r-- | tests/encoding.test | 78 | ||||
-rw-r--r-- | tests/http.test | 2 | ||||
-rw-r--r-- | tests/io.test | 10 | ||||
-rw-r--r-- | tests/main.test | 2 | ||||
-rw-r--r-- | tests/safe.test | 8 | ||||
-rw-r--r-- | tests/source.test | 2 |
15 files changed, 342 insertions, 80 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3 index c68070c..dc37519 100644 --- a/doc/Encoding.3 +++ b/doc/Encoding.3 @@ -25,9 +25,15 @@ int char * \fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR) .sp +size_t +\fBTcl_ExternalToUtfDStringEx\fR(\fIencoding, src, srcLen, flags, dstPtr\fR) +.sp char * \fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR) .sp +size_t +\fBTcl_UtfToExternalDStringEx\fR(\fIencoding, src, srcLen, flags, dstPtr\fR) +.sp int \fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR) @@ -108,7 +114,10 @@ byte is converted and then to reset to an initial state. \fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should return immediately upon reading a source character that does not exist in the target encoding; otherwise a default fallback character will -automatically be substituted. +automatically be substituted. The flag \fBTCL_ENCODING_NOCOMPLAIN\fR has +no effect, it is reserved for Tcl 9.0. The flag \fBTCL_ENCODING_MODIFIED\fR makes +\fBTcl_UtfToExternalDStringEx\fR and \fBTcl_UtfToExternal\fR produce the +byte sequence \exC0\ex80 in stead of \ex00, for the utf-8/cesu-8 encoders. .AP Tcl_EncodingState *statePtr in/out Used when converting a (generally long or indefinite length) byte stream in a piece-by-piece fashion. The conversion routine stores its current @@ -208,6 +217,11 @@ When converting, if any of the characters in the source buffer cannot be represented in the target encoding, a default fallback character will be used. The return value is a pointer to the value stored in the DString. .PP +\fBTcl_ExternalToUtfDStringEx\fR is the same as \fBTcl_ExternalToUtfDString\fR, +but it has an additional flags parameter. The return value is the index of +the first byte in the input string causing a conversion error. +Or (size_t)-1 if all is OK. +.PP \fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified \fIencoding\fR into UTF-8. Up to \fIsrcLen\fR bytes are converted from the source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR. @@ -246,6 +260,11 @@ characters in the source buffer cannot be represented in the target encoding, a default fallback character will be used. The return value is a pointer to the value stored in the DString. .PP +\fBTcl_UtfToExternalDStringEx\fR is the same as \fBTcl_UtfToExternalDString\fR, +but it has an additional flags parameter. The return value is the index of +the first byte of an utf-8 byte-sequence in the input string causing a +conversion error. Or (size_t)-1 if all is OK. +.PP \fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into the specified \fIencoding\fR. Up to \fIsrcLen\fR bytes are converted from the source buffer and up to \fIdstLen\fR converted bytes are stored in diff --git a/generic/tcl.decls b/generic/tcl.decls index 8e21b1d..a33ea56 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2436,6 +2436,14 @@ declare 656 { declare 657 { int Tcl_UniCharIsUnicode(int ch) } +declare 658 { + size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, + const char *src, int srcLen, int flags, Tcl_DString *dsPtr) +} +declare 659 { + size_t Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, + const char *src, int srcLen, int flags, Tcl_DString *dsPtr) +} # TIP #511 declare 660 { diff --git a/generic/tcl.h b/generic/tcl.h index b82cf0a..ef0fa75 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2059,10 +2059,10 @@ typedef struct Tcl_EncodingType { * encountering an invalid byte sequence or a * source character that has no mapping in the * target encoding. If clear, the converter - * substitues the problematic character(s) with + * substitutes the problematic character(s) with * one or more "close" characters in the * destination buffer and then continues to - * convert the source. + * convert the source. Only for Tcl 8.x. * TCL_ENCODING_NO_TERMINATE - If set, Tcl_ExternalToUtf does not append a * terminating NUL byte. Since it does not need * an extra byte for a terminating NUL, it fills @@ -2077,6 +2077,18 @@ typedef struct Tcl_EncodingType { * content. Otherwise, the number of chars * produced is controlled only by other limiting * factors. + * TCL_ENCODING_MODIFIED - Convert NULL bytes to \xC0\x80 in stead of + * 0x00. Only valid for "utf-8" and "cesu-8". + * This flag is implicit for external -> internal conversions, + * optional for internal -> external conversions. + * TCL_ENCODING_NOCOMPLAIN - If set, the converter + * substitutes the problematic character(s) with + * one or more "close" characters in the + * destination buffer and then continues to + * convert the source. If clear, the converter returns + * immediately upon encountering an invalid byte sequence + * or a source character that has no mapping in the + * target encoding. Only for Tcl 9.x. */ #define TCL_ENCODING_START 0x01 @@ -2084,6 +2096,8 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_STOPONERROR 0x04 #define TCL_ENCODING_NO_TERMINATE 0x08 #define TCL_ENCODING_CHAR_LIMIT 0x10 +#define TCL_ENCODING_MODIFIED 0x20 +#define TCL_ENCODING_NOCOMPLAIN 0x40 /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 8e6200d..60a2c42 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -513,8 +513,8 @@ TclInitEncodingCmd( Tcl_Interp* interp) /* Tcl interpreter */ { static const EnsembleImplMap encodingImplMap[] = { - {"convertfrom", EncodingConvertfromObjCmd, TclCompileBasic1Or2ArgCmd, NULL, NULL, 0}, - {"convertto", EncodingConverttoObjCmd, TclCompileBasic1Or2ArgCmd, NULL, NULL, 0}, + {"convertfrom", EncodingConvertfromObjCmd, TclCompileBasic1To3ArgCmd, NULL, NULL, 0}, + {"convertto", EncodingConverttoObjCmd, TclCompileBasic1To3ArgCmd, NULL, NULL, 0}, {"dirs", EncodingDirsObjCmd, TclCompileBasic0Or1ArgCmd, NULL, NULL, 1}, {"names", EncodingNamesObjCmd, TclCompileBasic0ArgCmd, NULL, NULL, 0}, {"system", EncodingSystemObjCmd, TclCompileBasic0Or1ArgCmd, NULL, NULL, 1}, @@ -550,25 +550,66 @@ EncodingConvertfromObjCmd( Tcl_Encoding encoding; /* Encoding to use */ int length; /* Length of the byte array being converted */ const char *bytesPtr; /* Pointer to the first byte of the array */ +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) + int flags = TCL_ENCODING_STOPONERROR; +#else + int flags = TCL_ENCODING_NOCOMPLAIN; +#endif + size_t result; if (objc == 2) { encoding = Tcl_GetEncoding(interp, NULL); data = objv[1]; - } else if (objc == 3) { - if (Tcl_GetEncodingFromObj(interp, objv[1], &encoding) != TCL_OK) { + } else if ((unsigned)(objc - 2) < 3) { + data = objv[objc - 1]; + bytesPtr = Tcl_GetString(objv[1]); + if (bytesPtr[0] == '-' && bytesPtr[1] == 'n' + && !strncmp(bytesPtr, "-nocomplain", strlen(bytesPtr))) { + flags = TCL_ENCODING_NOCOMPLAIN; + } else if (objc < 4) { + if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { + return TCL_ERROR; + } + goto encConvFromOK; + } else { + goto encConvFromError; + } + if (objc < 4) { + encoding = Tcl_GetEncoding(interp, NULL); + } else if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { return TCL_ERROR; } - data = objv[2]; } else { - Tcl_WrongNumArgs(interp, 1, objv, "?encoding? data"); + encConvFromError: + Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?encoding? data"); return TCL_ERROR; } +encConvFromOK: /* * Convert the string into a byte array in 'ds' */ - bytesPtr = (char *) Tcl_GetByteArrayFromObj(data, &length); - Tcl_ExternalToUtfDString(encoding, bytesPtr, length, &ds); +#if !defined(TCL_NO_DEPRECATED) && (TCL_MAJOR_VERSION < 9) + if (!(flags & TCL_ENCODING_STOPONERROR)) { + bytesPtr = (char *) Tcl_GetByteArrayFromObj(data, &length); + } else +#endif + bytesPtr = (char *) TclGetBytesFromObj(interp, data, &length); + if (bytesPtr == NULL) { + return TCL_ERROR; + } + result = Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, + flags, &ds); + if ((flags & TCL_ENCODING_STOPONERROR) && (result != (size_t)-1)) { + char buf[TCL_INTEGER_SPACE]; + sprintf(buf, "%" TCL_Z_MODIFIER "u", result); + Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected byte sequence starting at index %" + TCL_Z_MODIFIER "u: '\\x%X'", result, UCHAR(bytesPtr[result]))); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + Tcl_DStringFree(&ds); + return TCL_ERROR; + } /* * Note that we cannot use Tcl_DStringResult here because it will @@ -612,26 +653,62 @@ EncodingConverttoObjCmd( Tcl_Encoding encoding; /* Encoding to use */ int length; /* Length of the string being converted */ const char *stringPtr; /* Pointer to the first byte of the string */ + size_t result; +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) + int flags = TCL_ENCODING_STOPONERROR; +#else + int flags = TCL_ENCODING_NOCOMPLAIN; +#endif if (objc == 2) { encoding = Tcl_GetEncoding(interp, NULL); data = objv[1]; - } else if (objc == 3) { - if (Tcl_GetEncodingFromObj(interp, objv[1], &encoding) != TCL_OK) { + } else if ((unsigned)(objc - 2) < 3) { + data = objv[objc - 1]; + stringPtr = Tcl_GetString(objv[1]); + if (stringPtr[0] == '-' && stringPtr[1] == 'n' + && !strncmp(stringPtr, "-nocomplain", strlen(stringPtr))) { + flags = TCL_ENCODING_NOCOMPLAIN; + } else if (objc < 4) { + if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { + return TCL_ERROR; + } + goto encConvToOK; + } else { + goto encConvToError; + } + if (objc < 4) { + encoding = Tcl_GetEncoding(interp, NULL); + } else if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { return TCL_ERROR; } - data = objv[2]; } else { - Tcl_WrongNumArgs(interp, 1, objv, "?encoding? data"); + encConvToError: + Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?encoding? data"); return TCL_ERROR; } +encConvToOK: /* * Convert the string to a byte array in 'ds' */ stringPtr = TclGetStringFromObj(data, &length); - Tcl_UtfToExternalDString(encoding, stringPtr, length, &ds); + result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, + flags, &ds); + if ((flags & TCL_ENCODING_STOPONERROR) && (result != (size_t)-1)) { + size_t pos = Tcl_NumUtfChars(stringPtr, result); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&stringPtr[result], &ucs4); + sprintf(buf, "%" TCL_Z_MODIFIER "u", result); + Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected character at index %" + TCL_Z_MODIFIER "u: 'U+%06X'", pos, ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + Tcl_DStringFree(&ds); + return TCL_ERROR; + } Tcl_SetObjResult(interp, Tcl_NewByteArrayObj((unsigned char*) Tcl_DStringValue(&ds), Tcl_DStringLength(&ds))); diff --git a/generic/tclDecls.h b/generic/tclDecls.h index e84a7e8..0830a11 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1943,8 +1943,14 @@ EXTERN const char * Tcl_UtfNext(const char *src); EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); /* 657 */ EXTERN int Tcl_UniCharIsUnicode(int ch); -/* Slot 658 is reserved */ -/* Slot 659 is reserved */ +/* 658 */ +EXTERN size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, + const char *src, int srcLen, int flags, + Tcl_DString *dsPtr); +/* 659 */ +EXTERN size_t Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, + const char *src, int srcLen, int flags, + Tcl_DString *dsPtr); /* 660 */ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, int sigNumber); @@ -2650,8 +2656,8 @@ typedef struct TclStubs { const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ - void (*reserved658)(void); - void (*reserved659)(void); + size_t (*tcl_ExternalToUtfDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 658 */ + size_t (*tcl_UtfToExternalDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 659 */ int (*tcl_AsyncMarkFromSignal) (Tcl_AsyncHandler async, int sigNumber); /* 660 */ void (*reserved661)(void); void (*reserved662)(void); @@ -4007,8 +4013,10 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UtfPrev) /* 656 */ #define Tcl_UniCharIsUnicode \ (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */ -/* Slot 658 is reserved */ -/* Slot 659 is reserved */ +#define Tcl_ExternalToUtfDStringEx \ + (tclStubsPtr->tcl_ExternalToUtfDStringEx) /* 658 */ +#define Tcl_UtfToExternalDStringEx \ + (tclStubsPtr->tcl_UtfToExternalDStringEx) /* 659 */ #define Tcl_AsyncMarkFromSignal \ (tclStubsPtr->tcl_AsyncMarkFromSignal) /* 660 */ /* Slot 661 is reserved */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 4630a02..b6d5dcf 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -515,10 +515,8 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ -#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ @@ -1144,10 +1142,56 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, 0, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_ExternalToUtfDStringEx -- + * + * Convert a source buffer from the specified encoding into UTF-8. +* The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in utf-8. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +size_t +Tcl_ExternalToUtfDStringEx( + Tcl_Encoding encoding, /* The encoding for the source string, or NULL + * for the default system encoding. */ + const char *src, /* Source string in specified encoding. */ + int srcLen, /* Source string length in bytes, or < 0 for + * encoding-specific string length. */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int dstLen, result, soFar, srcRead, dstWrote, dstChars; + const char *srcStart = src; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -1164,7 +1208,7 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } @@ -1177,7 +1221,7 @@ Tcl_ExternalToUtfDString( src += srcRead; if (result != TCL_CONVERT_NOSPACE) { Tcl_DStringSetLength(dstPtr, soFar); - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? (size_t)-1 : (size_t)(src - srcStart); } flags &= ~TCL_ENCODING_START; srcLen -= srcRead; @@ -1336,10 +1380,57 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { + Tcl_UtfToExternalDStringEx(encoding, src, srcLen, 0, dstPtr); + return Tcl_DStringValue(dstPtr); +} + + +/* + *------------------------------------------------------------------------- + * + * Tcl_UtfToExternalDStringEx -- + * + * Convert a source buffer from UTF-8 to the specified encoding. + * The parameter flags controls the behavior, if any of the bytes in + * the source buffer are invalid or cannot be represented in the + * target encoding. + * Possible flags values: + * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but + * return the first error position (Default in Tcl 9.0). + * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default + * fallback character. Always return -1 (Default in Tcl 8.7). + * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. + * Only valid for "utf-8" and "cesu-8". This flag may be used together + * with the other flags. + * + * Results: + * The converted bytes are stored in the DString, which is then NULL + * terminated in an encoding-specific manner. The return value is + * the error position in the source string or -1 if no conversion error + * is reported. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +size_t +Tcl_UtfToExternalDStringEx( + Tcl_Encoding encoding, /* The encoding for the converted string, or + * NULL for the default system encoding. */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes, or < 0 for + * strlen(). */ + int flags, /* Conversion control flags. */ + Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + * converted string is stored. */ +{ char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; + int dstLen, result, soFar, srcRead, dstWrote, dstChars; + const char *srcStart = src; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -1355,7 +1446,7 @@ Tcl_UtfToExternalDString( } else if (srcLen < 0) { srcLen = strlen(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END; + flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, &state, dst, dstLen, @@ -1368,7 +1459,7 @@ Tcl_UtfToExternalDString( while (i >= soFar) { Tcl_DStringSetLength(dstPtr, i--); } - return Tcl_DStringValue(dstPtr); + return (result == TCL_OK) ? (size_t)-1 : (size_t)(src - srcStart); } flags &= ~TCL_ENCODING_START; @@ -2196,6 +2287,12 @@ BinaryProc( *------------------------------------------------------------------------- */ +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) +# define STOPONERROR !(flags & TCL_ENCODING_NOCOMPLAIN) +#else +# define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) +#endif + static int UtfToUtfProc( ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ @@ -2278,7 +2375,7 @@ UtfToUtfProc( */ if (flags & TCL_ENCODING_MODIFIED) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_MULTIBYTE; break; } @@ -2293,7 +2390,7 @@ UtfToUtfProc( int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) + if ((len < 2) && (ch != 0) && STOPONERROR && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; break; @@ -2318,7 +2415,8 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (flags & TCL_ENCODING_STOPONERROR) { + + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2333,7 +2431,7 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); ch = low; } else if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2519,7 +2617,7 @@ UtfToUtf32Proc( } len = TclUtfToUCS4(src, &ch); if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -2722,7 +2820,7 @@ UtfToUtf16Proc( } len = TclUtfToUCS4(src, &ch); if (!Tcl_UniCharIsUnicode(ch)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -2942,7 +3040,7 @@ TableToUtfProc( ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_SYNTAX; break; } @@ -3058,7 +3156,7 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3246,7 +3344,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3473,7 +3571,7 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if ((flags & TCL_ENCODING_STOPONERROR) == 0) { + if (!STOPONERROR) { /* * Skip the unknown escape sequence. */ @@ -3648,7 +3746,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (flags & TCL_ENCODING_STOPONERROR) { + if (STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 221ff67..257c3ce 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1939,8 +1939,8 @@ const TclStubs tclStubs = { Tcl_UtfNext, /* 655 */ Tcl_UtfPrev, /* 656 */ Tcl_UniCharIsUnicode, /* 657 */ - 0, /* 658 */ - 0, /* 659 */ + Tcl_ExternalToUtfDStringEx, /* 658 */ + Tcl_UtfToExternalDStringEx, /* 659 */ Tcl_AsyncMarkFromSignal, /* 660 */ 0, /* 661 */ 0, /* 662 */ diff --git a/tests/chanio.test b/tests/chanio.test index 2d26ac9..578dc9f 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -249,7 +249,7 @@ test chan-io-3.3 {WriteChars: compatibility with WriteBytes: flush on line} -bod } -cleanup { chan close $f } -result "\r\n12" -test chan-io-3.4 {WriteChars: loop over stage buffer} { +test chan-io-3.4 {WriteChars: loop over stage buffer} deprecated { # stage buffer maps to more than can be queued at once. set f [open $path(test1) w] chan configure $f -encoding jis0208 -buffersize 16 @@ -258,7 +258,7 @@ test chan-io-3.4 {WriteChars: loop over stage buffer} { chan close $f lappend x [contents $path(test1)] } [list "!)!)!)!)!)!)!)!)" "!)!)!)!)!)!)!)!)!)!)!)!)!)!)!)"] -test chan-io-3.5 {WriteChars: saved != 0} { +test chan-io-3.5 {WriteChars: saved != 0} deprecated { # Bytes produced by UtfToExternal from end of last channel buffer had to # be moved to beginning of next channel buffer to preserve requested # buffersize. @@ -285,7 +285,7 @@ test chan-io-3.6 {WriteChars: (stageRead + dstWrote == 0)} { chan close $f lappend x [contents $path(test1)] } [list "12345678901234\x82\x60" "12345678901234\x82\x60\x82\x61"] -test chan-io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} { +test chan-io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} deprecated { # When translating UTF-8 to external, the produced bytes went past end of # the channel buffer. This is done on purpose - we then truncate the bytes # at the end of the partial character to preserve the requested blocksize diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 5fefbeb..d787c7f 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -178,7 +178,7 @@ test cmdAH-4.2 {Tcl_EncodingObjCmd} -returnCodes error -body { } -result {unknown or ambiguous subcommand "foo": must be convertfrom, convertto, dirs, names, or system} test cmdAH-4.3 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertto -} -result {wrong # args: should be "encoding convertto ?encoding? data"} +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?encoding? data"} test cmdAH-4.4 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertto foo bar } -result {unknown encoding "foo"} @@ -200,7 +200,7 @@ test cmdAH-4.6 {Tcl_EncodingObjCmd} -setup { } -result 8C test cmdAH-4.7 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertfrom -} -result {wrong # args: should be "encoding convertfrom ?encoding? data"} +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?encoding? data"} test cmdAH-4.8 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertfrom foo bar } -result {unknown encoding "foo"} diff --git a/tests/encoding.test b/tests/encoding.test index 75e0dcc..bf82493 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -22,6 +22,8 @@ catch { package require -exact tcl::test [info patchlevel] } +testConstraint deprecated [expr {![info exists tcl_precision]}] + proc toutf {args} { variable x lappend x "toutf $args" @@ -297,7 +299,7 @@ test encoding-11.11 {encoding: extended Unicode UTF-32} { test encoding-12.1 {LoadTableEncoding: normal encoding} { set x [encoding convertto iso8859-3 Ġ] - append x [encoding convertto iso8859-3 Õ] + append x [encoding convertto -nocomplain iso8859-3 Õ] append x [encoding convertfrom iso8859-3 Õ] } "Õ?Ġ" test encoding-12.2 {LoadTableEncoding: single-byte encoding} { @@ -346,67 +348,67 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { } "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] + set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 edb882f09f9882eda0bd} test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] + set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 edb882eda0bdeda0bd} test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83Dé - set y [encoding convertto utf-8 \uDE02\uD83Dé] + set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 edb882eda0bdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX - set y [encoding convertto utf-8 \uDE02\uD83DX] + set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 edb882eda0bd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} { set x \uDE02é - set y [encoding convertto utf-8 \uDE02é] + set y [encoding convertto -nocomplain utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 edb882c3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} { set x \uDA02é - set y [encoding convertto utf-8 \uDA02é] + set y [encoding convertto -nocomplain utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 eda882c3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y - set y [encoding convertto utf-8 \uDE02Y] + set y [encoding convertto -nocomplain utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 edb88259} test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y - set y [encoding convertto utf-8 \uDA02Y] + set y [encoding convertto -nocomplain utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 eda88259} test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 - set y [encoding convertto utf-8 \uDE02] + set y [encoding convertto -nocomplain utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 edb882} test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 - set y [encoding convertto utf-8 \uDA02] + set y [encoding convertto -nocomplain utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 eda882} test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 - set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] + set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" test encoding-15.17 {UtfToUtfProc emoji character output} { @@ -487,10 +489,10 @@ test encoding-17.2 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto utf-16be "\uDCDC" + encoding convertto -nocomplain utf-16be "\uDCDC" } -result "\xFF\xFD" test encoding-17.4 {UtfToUtf16Proc} -body { - encoding convertto utf-16le "\uD8D8" + encoding convertto -nocomplain utf-16le "\uD8D8" } -result "\xFD\xFF" test encoding-17.5 {UtfToUtf16Proc} -body { encoding convertto utf-32le "\U460DC" @@ -615,26 +617,62 @@ test encoding-24.4 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 "\xC0\x80"] } 1 test encoding-24.5 {Parse valid or invalid utf-8} { - string length [encoding convertfrom utf-8 "\xC0\x81"] + string length [encoding convertfrom -nocomplain utf-8 "\xC0\x81"] } 2 test encoding-24.6 {Parse valid or invalid utf-8} { - string length [encoding convertfrom utf-8 "\xC1\xBF"] + string length [encoding convertfrom -nocomplain utf-8 "\xC1\xBF"] } 2 test encoding-24.7 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 "\xC2\x80"] } 1 test encoding-24.8 {Parse valid or invalid utf-8} { - string length [encoding convertfrom utf-8 "\xE0\x80\x80"] + string length [encoding convertfrom -nocomplain utf-8 "\xE0\x80\x80"] } 3 test encoding-24.9 {Parse valid or invalid utf-8} { - string length [encoding convertfrom utf-8 "\xE0\x9F\xBF"] + string length [encoding convertfrom -nocomplain utf-8 "\xE0\x9F\xBF"] } 3 test encoding-24.10 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 "\xE0\xA0\x80"] } 1 test encoding-24.11 {Parse valid or invalid utf-8} { - string length [encoding convertfrom utf-8 "\xEF\xBF\xBF"] + string length [encoding convertfrom -nocomplain utf-8 "\xEF\xBF\xBF"] +} 1 +test encoding-24.12 {Parse valid or invalid utf-8} -constraints deprecated -body { + encoding convertfrom utf-8 "\xC0\x81" +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xC0'} +test encoding-24.13 {Parse valid or invalid utf-8} -constraints deprecated -body { + encoding convertfrom utf-8 "\xC1\xBF" +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xC1'} +test encoding-24.14 {Parse valid or invalid utf-8} { + string length [encoding convertfrom utf-8 "\xC2\x80"] +} 1 +test encoding-24.15 {Parse valid or invalid utf-8} -constraints deprecated -body { + encoding convertfrom utf-8 "Z\xE0\x80" +} -returnCodes 1 -result {unexpected byte sequence starting at index 1: '\xE0'} +test encoding-24.16 {Parse valid or invalid utf-8} -constraints {testbytestring deprecated} -body { + encoding convertto utf-8 [testbytestring "Z\u4343\x80"] +} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃' (U+004343)} +test encoding-24.17 {Parse valid or invalid utf-8} -constraints {testbytestring deprecated} -body { + encoding convertto utf-8 [testbytestring "Z\xE0\x80"] +} -result "Z\xC3\xA0\xE2\x82\xAC" +test encoding-24.18 {Parse valid or invalid utf-8} -constraints {testbytestring deprecated} -body { + encoding convertto utf-8 [testbytestring "Z\xE0\x80xxxxxx"] +} -result "Z\xC3\xA0\xE2\x82\xACxxxxxx" +test encoding-24.19 {Parse valid or invalid utf-8} -constraints deprecated -body { + encoding convertto utf-8 "ZX\uD800" +} -returnCodes 1 -match glob -result "unexpected character at index 2: 'U+00D800'" +test encoding-24.20 {Parse with -nocomplain but without providing encoding} { + string length [encoding convertfrom -nocomplain "\x20"] +} 1 +test encoding-24.21 {Parse with -nocomplain but without providing encoding} { + string length [encoding convertto -nocomplain "\x20"] } 1 +test encoding-24.22 {Syntax error, two encodings} -body { + encoding convertfrom iso8859-1 utf-8 "ZX\uD800" +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?encoding? data"} +test encoding-24.23 {Syntax error, two encodings} -body { + encoding convertto iso8859-1 utf-8 "ZX\uD800" +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?encoding? data"} file delete [file join [temporaryDirectory] iso2022.txt] @@ -790,7 +828,7 @@ test encoding-28.0 {all encodings load} -body { set string hello foreach name [encoding names] { incr count - encoding convertto $name $string + encoding convertto -nocomplain $name $string # discard the cached internal representation of Tcl_Encoding # Unfortunately, without this, encoding 2-1 fails. diff --git a/tests/http.test b/tests/http.test index 2fd5af4..93998fe 100644 --- a/tests/http.test +++ b/tests/http.test @@ -661,7 +661,7 @@ test http-7.3 {http::formatQuery} -setup { } -cleanup { http::config -urlencoding $enc } -result "can't read \"formMap(∈)\": no such element in array" -test http-7.4 {http::formatQuery} -setup { +test http-7.4 {http::formatQuery} -constraints deprecated -setup { set enc [http::config -urlencoding] } -body { # this would be reverting to http <=2.4 behavior w/o errors diff --git a/tests/io.test b/tests/io.test index 0ef3422..821b11e 100644 --- a/tests/io.test +++ b/tests/io.test @@ -268,7 +268,7 @@ test io-3.3 {WriteChars: compatibility with WriteBytes: flush on line} { close $f set x } "\r\n12" -test io-3.4 {WriteChars: loop over stage buffer} { +test io-3.4 {WriteChars: loop over stage buffer} deprecated { # stage buffer maps to more than can be queued at once. set f [open $path(test1) w] @@ -278,7 +278,7 @@ test io-3.4 {WriteChars: loop over stage buffer} { close $f lappend x [contents $path(test1)] } [list "!)!)!)!)!)!)!)!)" "!)!)!)!)!)!)!)!)!)!)!)!)!)!)!)"] -test io-3.5 {WriteChars: saved != 0} { +test io-3.5 {WriteChars: saved != 0} deprecated { # Bytes produced by UtfToExternal from end of last channel buffer # had to be moved to beginning of next channel buffer to preserve # requested buffersize. @@ -307,7 +307,7 @@ test io-3.6 {WriteChars: (stageRead + dstWrote == 0)} { close $f lappend x [contents $path(test1)] } [list "12345678901234\x82\x60" "12345678901234\x82\x60\x82\x61"] -test io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} { +test io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} deprecated { # When translating UTF-8 to external, the produced bytes went past end # of the channel buffer. This is done purpose -- we then truncate the # bytes at the end of the partial character to preserve the requested @@ -1532,7 +1532,7 @@ test io-12.8 {ReadChars: multibyte chars split} { close $f scan [string index $in end] %c } 160 -test io-12.9 {ReadChars: multibyte chars split} { +test io-12.9 {ReadChars: multibyte chars split} deprecated { set f [open $path(test1) w] fconfigure $f -translation binary puts -nonewline $f [string repeat a 9]\xC2 @@ -1543,7 +1543,7 @@ test io-12.9 {ReadChars: multibyte chars split} { close $f scan [string index $in end] %c } 194 -test io-12.10 {ReadChars: multibyte chars split} { +test io-12.10 {ReadChars: multibyte chars split} deprecated { set f [open $path(test1) w] fconfigure $f -translation binary puts -nonewline $f [string repeat a 9]\xC2 diff --git a/tests/main.test b/tests/main.test index 2d3f63c..ef58b6f 100644 --- a/tests/main.test +++ b/tests/main.test @@ -143,7 +143,7 @@ namespace eval ::tcl::test::main { test Tcl_Main-1.8 { Tcl_Main: startup script - -encoding option - mismatched encodings } -constraints { - stdio + stdio deprecated } -setup { set script [makeFile {} script] file delete $script diff --git a/tests/safe.test b/tests/safe.test index 773b16f..5f3eae8 100644 --- a/tests/safe.test +++ b/tests/safe.test @@ -1269,7 +1269,7 @@ test safe-11.7 {testing safe encoding} -setup { interp eval $i encoding convertfrom } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ?encoding? data"} +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?encoding? data"} test safe-11.7.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1278,7 +1278,7 @@ test safe-11.7.1 {testing safe encoding} -setup { } -returnCodes ok -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ?encoding? data" +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?encoding? data" while executing "encoding convertfrom" invoked from within @@ -1291,7 +1291,7 @@ test safe-11.8 {testing safe encoding} -setup { interp eval $i encoding convertto } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ?encoding? data"} +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?encoding? data"} test safe-11.8.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1300,7 +1300,7 @@ test safe-11.8.1 {testing safe encoding} -setup { } -returnCodes ok -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ?encoding? data" +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?encoding? data" while executing "encoding convertto" invoked from within diff --git a/tests/source.test b/tests/source.test index eee03ec..d67bec5 100644 --- a/tests/source.test +++ b/tests/source.test @@ -275,7 +275,7 @@ test source-7.5 {source -encoding: correct operation} -setup { removeFile source.file rename € {} } -result foo -test source-7.6 {source -encoding: mismatch encoding error} -setup { +test source-7.6 {source -encoding: mismatch encoding error} -constraints deprecated -setup { set sourcefile [makeFile {} source.file] file delete $sourcefile set f [open $sourcefile w] |