summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-17 19:14:42 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2023-02-17 19:14:42 (GMT)
commit: c731ca1ffdd3e7cc90cf064ac89b2f71551958ce (patch)
tree: 2ab23d68db683df8b5bced7a4bd250d57e148173
parent: 2c3252bc5c0a80e90ade82389f8b80faa41a6e77 (diff)
parent: 45796af99db14504cedf31f0336e108930482ebf (diff)
download: tcl-c731ca1ffdd3e7cc90cf064ac89b2f71551958ce.zip
tcl-c731ca1ffdd3e7cc90cf064ac89b2f71551958ce.tar.gz
tcl-c731ca1ffdd3e7cc90cf064ac89b2f71551958ce.tar.bz2
Fix for [885c86a9a0]: convertfrom utf8 breaks for 4 byte utf encodings
-rw-r--r-- generic/tclEncoding.c 16
-rw-r--r-- tests/encoding.test 8
2 files changed, 14 insertions, 10 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 10789b1..1d3a3eb 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2417,16 +2417,14 @@ UtfToUtfProc(
result = TCL_CONVERT_MULTIBYTE;
break;
}
- if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) {
- result = TCL_CONVERT_SYNTAX;
- break;
- }
- ch = UCHAR(*src++);
- } else {
- char chbuf[2];
- chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
- TclUtfToUCS4(chbuf, &ch);
+ if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
}
+ char chbuf[2];
+ chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
+ TclUtfToUCS4(chbuf, &ch);
dst += Tcl_UniCharToUtf(ch, dst);
} else {
int low;
diff --git a/tests/encoding.test b/tests/encoding.test
index 50a0cc2..03f0273 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -768,7 +768,7 @@ test encoding-24.14 {Parse valid or invalid utf-8} {
} 1
test encoding-24.15 {Parse valid or invalid utf-8} -constraints deprecated -body {
encoding convertfrom utf-8 "Z\xE0\x80"
-} -result Z\xE0\x80
+} -result Z\xE0\u20AC
test encoding-24.16 {Parse valid or invalid utf-8} -constraints testbytestring -body {
encoding convertto utf-8 [testbytestring "Z\u4343\x80"]
} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃€' (U+004343)}
@@ -847,6 +847,12 @@ test encoding-24.40 {Try to generate invalid utf-8 with -nocomplain} -body {
test encoding-24.41 {Parse invalid utf-8 with -strict} -body {
encoding convertfrom -strict utf-8 \xED\xA0\x80\xED\xB0\x80
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'}
+test encoding-24.42 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body {
+ encoding convertfrom -nocomplain utf-8 \xF0\x80\x80\x80
+} -result \xF0\u20AC\u20AC\u20AC
+test encoding-24.43 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body {
+ encoding convertfrom -nocomplain utf-8 \x80
+} -result \u20AC
file delete [file join [temporaryDirectory] iso2022.txt]