Merge 8.7

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-14 22:06:32 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-14 22:06:32 (GMT)
commit: fb260e838200ee3007a853db81e2e9f775ebb783 (patch)
tree: 4bfe1e1a8a62a40a77ae74f373e4c10742218f0e
parent: 0c306c8f81e856f8992e427ec5a1b9611a7c0434 (diff)
parent: 5040a69c28a92e0318ca56498bdf30f8690470ae (diff)
download: tcl-fb260e838200ee3007a853db81e2e9f775ebb783.zip
tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.gz
tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.bz2
2 files changed, 60 insertions, 17 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 6990346..9d896e2 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -525,7 +525,8 @@ FillEncodingFileMap(void)
 /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
  * TCL_ENCODING_LE is only used for  utf-16/utf-32/ucs-2. re-use the same value */
 #define TCL_ENCODING_LE		TCL_ENCODING_MODIFIED	/* Little-endian encoding */
-#define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
+#define ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
+#define ENCODING_INPUT	0x400	/* For UTF-8/CESU-8 encoding: conversion direction is external -> internal (input) */
 
 void
 TclInitEncodingSubsystem(void)
@@ -567,7 +568,7 @@ TclInitEncodingSubsystem(void)
     type.fromUtfProc	= UtfToUtfProc;
     type.freeProc	= NULL;
     type.nullSize	= 1;
-    type.clientData	= INT2PTR(TCL_ENCODING_UTF);
+    type.clientData	= INT2PTR(ENCODING_UTF);
     Tcl_CreateEncoding(&type);
     type.clientData	= INT2PTR(TCL_ENCODING_NOCOMPLAIN);
     type.encodingName	= "cesu-8";
@@ -1180,7 +1181,7 @@ Tcl_ExternalToUtfDStringEx(
 
     flags |= TCL_ENCODING_START | TCL_ENCODING_END;
     if (encodingPtr->toUtfProc == UtfToUtfProc) {
-	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+	flags |= ENCODING_INPUT;
     }
 
     while (1) {
@@ -1297,7 +1298,7 @@ Tcl_ExternalToUtf(
 	dstLen--;
     }
     if (encodingPtr->toUtfProc == UtfToUtfProc) {
-	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+	flags |= ENCODING_INPUT;
     }
     do {
 	Tcl_EncodingState savedState = *statePtr;
@@ -1390,7 +1391,7 @@ Tcl_UtfToExternalDStringEx(
     const char *src,		/* Source string in UTF-8. */
     size_t srcLen,			/* Source string length in bytes, or < 0 for
 				 * strlen(). */
-    int flags,	/* Conversion control flags. */
+    int flags,			/* Conversion control flags. */
     Tcl_DString *dstPtr)	/* Uninitialized or free DString in which the
 				 * converted string is stored. */
 {
@@ -2300,7 +2301,7 @@ UtfToUtfProc(
 
     dstStart = dst;
     flags |= PTR2INT(clientData);
-    dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
+    dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
 
     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2316,7 +2317,7 @@ UtfToUtfProc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) {
+	if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
 	    /*
 	     * Copy 7bit characters, but skip null-bytes when we are in input
 	     * mode, so that they get converted to 0xC080.
@@ -2324,11 +2325,13 @@ UtfToUtfProc(
 
 	    *dst++ = *src++;
 	} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
-		&& (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) {
+		&& (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) && (!(flags & ENCODING_INPUT)
+			|| ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
+			|| (flags & ENCODING_FAILINDEX))) {
 	    /*
 	     * If in input mode, and -strict or -failindex is specified: This is an error.
 	     */
-	    if (flags & TCL_ENCODING_MODIFIED) {
+	    if (flags & ENCODING_INPUT) {
 		result = TCL_CONVERT_SYNTAX;
 		break;
 	    }
@@ -2346,7 +2349,7 @@ UtfToUtfProc(
 	     * unless the user has explicitly asked to be told.
 	     */
 
-	    if (flags & TCL_ENCODING_MODIFIED) {
+	    if (flags & ENCODING_INPUT) {
 		if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
 		    result = TCL_CONVERT_MULTIBYTE;
 		    break;
@@ -2365,13 +2368,13 @@ UtfToUtfProc(
 	} else {
 	    const char *saveSrc = src;
 	    size_t len = TclUtfToUCS4(src, &ch);
-	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)
+	    if ((len < 2) && (ch != 0) && (flags & ENCODING_INPUT)
 		    && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
 		result = TCL_CONVERT_SYNTAX;
 		break;
 	    }
 	    src += len;
-	    if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) {
+	    if (!(flags & ENCODING_UTF) && (ch > 0x3FF)) {
 		if (ch > 0xFFFF) {
 		    /* CESU-8 6-byte sequence for chars > U+FFFF */
 		    ch -= 0x10000;
@@ -2393,6 +2396,11 @@ UtfToUtfProc(
 		 * A surrogate character is detected, handle especially.
 		 */
 
+		if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) {
+		    result = TCL_CONVERT_UNKNOWN;
+		    src = saveSrc;
+		    break;
+		}
 		int low = ch;
 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 
@@ -2409,12 +2417,12 @@ UtfToUtfProc(
 		dst += Tcl_UniCharToUtf(ch, dst);
 		ch = low;
 #endif
-	    } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch  & ~0x7FF) == 0xD800))) {
+	    } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch  & ~0x7FF) == 0xD800))) {
 		result = TCL_CONVERT_UNKNOWN;
 		src = saveSrc;
 		break;
 	    } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
-		    && (flags & TCL_ENCODING_MODIFIED) && ((ch  & ~0x7FF) == 0xD800)) {
+		    && (flags & ENCODING_INPUT) && ((ch  & ~0x7FF) == 0xD800)) {
 		result = TCL_CONVERT_SYNTAX;
 		src = saveSrc;
 		break;
@@ -3040,7 +3048,8 @@ TableToUtfProc(
 	    ch = pageZero[byte];
 	}
 	if ((ch == 0) && (byte != 0)) {
-	    if (STOPONERROR) {
+	    if ((flags & ENCODING_FAILINDEX)
+		    || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
 		result = TCL_CONVERT_SYNTAX;
 		break;
 	    }
diff --git a/tests/encoding.test b/tests/encoding.test
index 3fba2f3..aef5028 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -453,6 +453,24 @@ test encoding-15.24 {UtfToUtfProc CESU-8 bug [048dd20b4171c8da]} {
     binary scan $y H* z
     list [string length $y] $z
 } {2 cfbf}
+test encoding-15.25 {UtfToUtfProc CESU-8} {
+    encoding convertfrom cesu-8 \x00
+} \x00
+test encoding-15.26 {UtfToUtfProc CESU-8} {
+    encoding convertfrom cesu-8 \xC0\x80
+} \x00
+test encoding-15.27 {UtfToUtfProc -strict CESU-8} {
+    encoding convertfrom -strict cesu-8 \x00
+} \x00
+test encoding-15.28 {UtfToUtfProc -strict CESU-8} {
+    encoding convertfrom -strict cesu-8 \xC0\x80
+} \x00
+test encoding-15.29 {UtfToUtfProc CESU-8} {
+    encoding convertto cesu-8 \x00
+} \xC0\x80
+test encoding-15.30 {UtfToUtfProc -strict CESU-8} {
+    encoding convertto -strict cesu-8 \x00
+} \xC0\x80
 
 test encoding-16.1 {Utf16ToUtfProc} -body {
     set val [encoding convertfrom utf-16 NN]
@@ -585,8 +603,21 @@ test encoding-18.6 {TableToUtfProc on invalid input with -nocomplain} -body {
 	list [catch {encoding convertto -nocomplain jis0208 \\} res] $res
 } -result {0 !)}
 
-test encoding-19.1 {TableFromUtfProc} {
-} {}
+test encoding-19.1 {TableFromUtfProc} -body {
+    encoding convertfrom ascii AÁ
+} -result AÁ
+test encoding-19.2 {TableFromUtfProc} -body {
+    encoding convertfrom -nocomplain ascii AÁ
+} -result AÁ
+test encoding-19.3 {TableFromUtfProc} -body {
+    encoding convertfrom -strict ascii AÁ
+} -returnCodes 1 -result {unexpected byte sequence starting at index 1: '\xC1'}
+test encoding-19.4 {TableFromUtfProc} -body {
+    list [encoding convertfrom -failindex idx ascii AÁ] [set idx]
+} -result {A 1}
+test encoding-19.5 {TableFromUtfProc} -body {
+    list [encoding convertfrom -failindex idx -strict ascii AÁ] [set idx]
+} -result {A 1}
 
 test encoding-20.1 {TableFreefProc} {
 } {}
@@ -805,6 +836,9 @@ test encoding-24.39 {Try to generate invalid utf-8 with -strict} -body {
 test encoding-24.40 {Try to generate invalid utf-8 with -nocomplain} -body {
     encoding convertto -nocomplain utf-8 \uD800
 } -result \xED\xA0\x80
+test encoding-24.41 {Parse invalid utf-8 with -strict} -body {
+    encoding convertfrom -strict utf-8 \xED\xA0\x80\xED\xB0\x80
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'}
 
 file delete [file join [temporaryDirectory] iso2022.txt]
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-14 22:06:32 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2023-02-14 22:06:32 (GMT)
commit	fb260e838200ee3007a853db81e2e9f775ebb783 (patch)
tree	4bfe1e1a8a62a40a77ae74f373e4c10742218f0e
parent	0c306c8f81e856f8992e427ec5a1b9611a7c0434 (diff)
parent	5040a69c28a92e0318ca56498bdf30f8690470ae (diff)
download	tcl-fb260e838200ee3007a853db81e2e9f775ebb783.zip tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.gz tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.bz2