summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-14 20:50:26 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-14 20:50:26 (GMT)
commit: 38df35585000fd7245c6604e845663751a7bd524 (patch)
tree: ba2df03a1bbdaac0c186059f8712310934067a00
parent: 2974b5727951737a5b67789f4b7712cf72096ed0 (diff)
downloadtcl-38df35585000fd7245c6604e845663751a7bd524.zip
tcl-38df35585000fd7245c6604e845663751a7bd524.tar.gz
tcl-38df35585000fd7245c6604e845663751a7bd524.tar.bz2
Complete fix for [bd1a60eb9c]. Also fix a bug in the table encoding (TableToUtfProc). With test cases.
-rw-r--r-- generic/tclEncoding.c 24
-rw-r--r-- tests/encoding.test 38
2 files changed, 52 insertions, 10 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 01c4eb1..c5ecc46 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2380,7 +2380,7 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) {
+ if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
/*
* Copy 7bit characters, but skip null-bytes when we are in input
* mode, so that they get converted to 0xC080.
@@ -2388,11 +2388,13 @@ UtfToUtfProc(
*dst++ = *src++;
} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
- && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) {
+ && (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) && (!(flags & ENCODING_INPUT)
+ || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
+ || (flags & ENCODING_FAILINDEX))) {
/*
* If in input mode, and -strict or -failindex is specified: This is an error.
*/
- if (flags & TCL_ENCODING_MODIFIED) {
+ if (flags & ENCODING_INPUT) {
result = TCL_CONVERT_SYNTAX;
break;
}
@@ -2410,7 +2412,7 @@ UtfToUtfProc(
* unless the user has explicitly asked to be told.
*/
- if (flags & TCL_ENCODING_MODIFIED) {
+ if (flags & ENCODING_INPUT) {
if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
result = TCL_CONVERT_MULTIBYTE;
break;
@@ -2430,7 +2432,7 @@ UtfToUtfProc(
int low;
const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
- if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)
+ if ((len < 2) && (ch != 0) && (flags & ENCODING_INPUT)
&& (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
result = TCL_CONVERT_SYNTAX;
break;
@@ -2451,6 +2453,11 @@ UtfToUtfProc(
* A surrogate character is detected, handle especially.
*/
+ if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
@@ -2470,12 +2477,12 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
- } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) {
+ } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch & ~0x7FF) == 0xD800))) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
} else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
- && (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) {
+ && (flags & ENCODING_INPUT) && ((ch & ~0x7FF) == 0xD800)) {
result = TCL_CONVERT_SYNTAX;
src = saveSrc;
break;
@@ -3117,7 +3124,8 @@ TableToUtfProc(
ch = pageZero[byte];
}
if ((ch == 0) && (byte != 0)) {
- if (STOPONERROR) {
+ if ((flags & ENCODING_FAILINDEX)
+ || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) {
result = TCL_CONVERT_SYNTAX;
break;
}
diff --git a/tests/encoding.test b/tests/encoding.test
index b2b029e..bbb40d7 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -452,6 +452,24 @@ test encoding-15.24 {UtfToUtfProc CESU-8 bug [048dd20b4171c8da]} {
binary scan $y H* z
list [string length $y] $z
} {2 cfbf}
+test encoding-15.25 {UtfToUtfProc CESU-8} {
+ encoding convertfrom cesu-8 \x00
+} \x00
+test encoding-15.26 {UtfToUtfProc CESU-8} {
+ encoding convertfrom cesu-8 \xC0\x80
+} \x00
+test encoding-15.27 {UtfToUtfProc -strict CESU-8} {
+ encoding convertfrom -strict cesu-8 \x00
+} \x00
+test encoding-15.28 {UtfToUtfProc -strict CESU-8} {
+ encoding convertfrom -strict cesu-8 \xC0\x80
+} \x00
+test encoding-15.29 {UtfToUtfProc CESU-8} {
+ encoding convertto cesu-8 \x00
+} \xC0\x80
+test encoding-15.30 {UtfToUtfProc -strict CESU-8} {
+ encoding convertto -strict cesu-8 \x00
+} \xC0\x80
test encoding-16.1 {Utf16ToUtfProc} -body {
set val [encoding convertfrom utf-16 NN]
@@ -584,8 +602,21 @@ test encoding-18.6 {TableToUtfProc on invalid input with -nocomplain} -body {
list [catch {encoding convertto -nocomplain jis0208 \\} res] $res
} -result {0 !)}
-test encoding-19.1 {TableFromUtfProc} {
-} {}
+test encoding-19.1 {TableFromUtfProc} -body {
+ encoding convertfrom ascii AÁ
+} -result AÁ
+test encoding-19.2 {TableFromUtfProc} -body {
+ encoding convertfrom -nocomplain ascii AÁ
+} -result AÁ
+test encoding-19.3 {TableFromUtfProc} -body {
+ encoding convertfrom -strict ascii AÁ
+} -returnCodes 1 -result {unexpected byte sequence starting at index 1: '\xC1'}
+test encoding-19.4 {TableFromUtfProc} -body {
+ list [encoding convertfrom -failindex idx ascii AÁ] [set idx]
+} -result {A 1}
+test encoding-19.5 {TableFromUtfProc} -body {
+ list [encoding convertfrom -failindex idx -strict ascii AÁ] [set idx]
+} -result {A 1}
test encoding-20.1 {TableFreefProc} {
} {}
@@ -804,6 +835,9 @@ test encoding-24.39 {Try to generate invalid utf-8 with -strict} -body {
test encoding-24.40 {Try to generate invalid utf-8 with -nocomplain} -body {
encoding convertto -nocomplain utf-8 \uD800
} -result \xED\xA0\x80
+test encoding-24.41 {Parse invalid utf-8 with -strict} -body {
+ encoding convertfrom -strict utf-8 \xED\xA0\x80\xED\xB0\x80
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'}
file delete [file join [temporaryDirectory] iso2022.txt]