summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclEncoding.c 37
-rw-r--r-- tests/encoding.test 21
2 files changed, 35 insertions, 23 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 470f8f3..a11e696 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2412,11 +2412,11 @@ UtfToUtfProc(
*dst++ = *src++;
}
else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
- (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) &&
+ (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) &&
(!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||
PROFILE_REPLACE(profile))) {
/* Special sequence \xC0\x80 */
- if (flags & ENCODING_INPUT) {
+ if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {
if (PROFILE_REPLACE(profile)) {
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
src += 2;
@@ -2485,26 +2485,28 @@ UtfToUtfProc(
}
else {
int low;
- const char *saveSrc = src;
+ int isInvalid = 0;
size_t len = TclUtfToUCS4(src, &ch);
-
- /*
- * Valid single char encodings were already handled earlier.
- * So len==1 means an invalid byte that is magically transformed
- * to a code point unless it resulted from the special
- * \xC0\x80 sequence. Tests io-75.*
- */
- if ((len < 2) && (ch != 0) && (flags & ENCODING_INPUT)) {
- if (PROFILE_STRICT(profile)) {
- result = TCL_CONVERT_SYNTAX;
- break;
- } else if (PROFILE_REPLACE(profile)) {
- ch = UNICODE_REPLACE_CHAR;
+ if (flags & ENCODING_INPUT) {
+ if ((len < 2) && (ch != 0)) {
+ isInvalid = 1;
+ } else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
+ isInvalid = 1;
+ }
+ if (isInvalid) {
+ if (PROFILE_STRICT(profile)) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ }
+ else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
}
}
+ const char *saveSrc = src;
src += len;
- if (!(flags & ENCODING_UTF) && (ch > 0x3FF)) {
+ if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
if (ch > 0xFFFF) {
/* CESU-8 6-byte sequence for chars > U+FFFF */
ch -= 0x10000;
@@ -2670,6 +2672,7 @@ Utf32ToUtfProc(
if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) {
if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
+ ch = 0;
break;
}
if (PROFILE_REPLACE(flags)) {
diff --git a/tests/encoding.test b/tests/encoding.test
index 7b3304b..de6b87b 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -459,17 +459,20 @@ test encoding-15.26 {UtfToUtfProc CESU-8} {
encoding convertfrom cesu-8 \xC0\x80
} \x00
test encoding-15.27 {UtfToUtfProc -profile strict CESU-8} {
- encoding convertfrom -profile strict cesu-8 \xC0\x80
+ encoding convertfrom -profile strict cesu-8 \x00
} \x00
-test encoding-15.28 {UtfToUtfProc -profile strict CESU-8} {
+test encoding-15.28 {UtfToUtfProc -profile strict CESU-8} -body {
encoding convertfrom -profile strict cesu-8 \xC0\x80
-} \x00
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xC0'}
test encoding-15.29 {UtfToUtfProc CESU-8} {
encoding convertto cesu-8 \x00
-} \xC0\x80
+} \x00
test encoding-15.30 {UtfToUtfProc -profile strict CESU-8} {
encoding convertto -profile strict cesu-8 \x00
-} \xC0\x80
+} \x00
+test encoding-15.31 {UtfToUtfProc -profile strict CESU-8 (bytes F0-F4 are invalid)} -body {
+ encoding convertfrom -profile strict cesu-8 \xF1\x86\x83\x9C
+} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xF1'}
test encoding-16.1 {Utf16ToUtfProc} -body {
set val [encoding convertfrom utf-16 NN]
@@ -527,6 +530,9 @@ test encoding-16.15 {Utf16ToUtfProc} -body {
test encoding-16.16 {Utf16ToUtfProc} -body {
encoding convertfrom utf-16le \x00\xDC\x00\xD8
} -result \uDC00\uD800
+test encoding-16.17 {Utf32ToUtfProc} -body {
+ list [encoding convertfrom -profile strict -failindex idx utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00] [set idx]
+} -result {A 4}
test encoding-16.9 {
Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
@@ -614,9 +620,12 @@ test encoding-19.3 {TableFromUtfProc} -body {
test encoding-19.4 {TableFromUtfProc} -body {
list [encoding convertfrom -failindex idx ascii AÁ] [set idx]
} -result [list A\xC1 -1]
-test encoding-19.4 {TableFromUtfProc} -body {
+test encoding-19.5 {TableFromUtfProc} -body {
list [encoding convertfrom -failindex idx -profile strict ascii A\xC1] [set idx]
} -result {A 1}
+test encoding-19.6 {TableFromUtfProc} -body {
+ list [encoding convertfrom -failindex idx -profile strict ascii AÁB] [set idx]
+} -result {A 1}
test encoding-20.1 {TableFreefProc} {
} {}